import pandas as pd
import numpy as np
import math
import itertools
import re
import os
import pathlib
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
sns.set(font="MS Gothic")
import scipy
# 学校データの読み込み
# tabデータにはヘッダーが存在しないため、直接カラム名を与えて読み込む
schools_columns = ['SCHID', 'SCHLURBN', 'GRDRANGE', 'FLAGGK', 'FLAGG1', 'FLAGG2', 'FLAGG3',
'GKENRMNT', 'GKCHAPT1', 'GKFRLNCH', 'GKBUSED', 'GKNATVAM', 'GKASIAN',
'GKBLACK', 'GKHSPANC', 'GKWHITE', 'GKOTHRAC', 'G1ENRMNT', 'G1AVGDAT',
'G1AVGDMB', 'G1CHAPT1', 'G1FRLNCH', 'G1BUSED', 'G1NATVAM', 'G1ASIAN',
'G1BLACK', 'G1HSPANC', 'G1WHITE', 'G1OTHRAC', 'G2ENRMNT', 'G2AVGDAT',
'G2AVGDMB', 'G2CHAPT1', 'G2FRLNCH', 'G2BUSED', 'G2NATVAM', 'G2ASIAN',
'G2BLACK', 'G2HSPANC', 'G2WHITE', 'G2OTHRAC', 'G3ENRMNT', 'G3AVGDAT',
'G3AVGDMB', 'G3CHAPT1', 'G3FRLNCH', 'G3BUSED', 'G3NATVAM', 'G3ASIAN',
'G3BLACK', 'G3HSPANC', 'G3WHITE', 'G3OTHRAC']
schools_df = pd.read_table('..\data\STAR_K-3_Schools.tab', header=None)
schools_df.columns = schools_columns
schools_df.head()
| SCHID | SCHLURBN | GRDRANGE | FLAGGK | FLAGG1 | FLAGG2 | FLAGG3 | GKENRMNT | GKCHAPT1 | GKFRLNCH | ... | G3AVGDMB | G3CHAPT1 | G3FRLNCH | G3BUSED | G3NATVAM | G3ASIAN | G3BLACK | G3HSPANC | G3WHITE | G3OTHRAC | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 112038 | 4 | 5.0 | 1 | 1 | 1 | 1 | 330.0 | 1.0 | 67.0 | ... | 400.0 | 1.0 | 65.0 | 57.0 | 0.0 | 2.0 | 32.0 | 0.0 | 66.0 | 0.0 |
| 1 | 123056 | 3 | 8.0 | 1 | 1 | 1 | 1 | 620.0 | 1.0 | 35.0 | ... | 591.0 | 1.0 | 30.0 | 98.0 | 0.0 | 0.0 | 1.0 | 0.0 | 99.0 | 0.0 |
| 2 | 128068 | 3 | 8.0 | 1 | 0 | 0 | 0 | 540.0 | 1.0 | 40.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 128076 | 2 | 8.0 | 1 | 1 | 1 | 1 | 564.0 | 2.0 | 10.0 | ... | 504.0 | 2.0 | 23.0 | 90.0 | 0.0 | 0.0 | 3.0 | 0.0 | 97.0 | 0.0 |
| 4 | 128079 | 3 | 5.0 | 1 | 1 | 1 | 1 | 471.0 | 1.0 | 34.0 | ... | 504.0 | 1.0 | 32.0 | 72.0 | 0.0 | 0.0 | 5.0 | 0.0 | 95.0 | 0.0 |
5 rows × 53 columns
schools_df.shape
(80, 53)
列情報の詳細はstarUsersGuide.pdfのChapter.4 を確認
# 学校データの主要列(カテゴリ列)について、データの分布を確認
from matplotlib import pyplot as plt
%matplotlib inline
school_tgt_columns_cat = ['SCHLURBN', # 学校の都市性
'GRDRANGE', # 学校に所属する学生のレンジ(日本と違い、小学校or中学校などの明確な区分はなく、所属範囲は学校側の裁量による)
'FLAGGK', 'FLAGG1', 'FLAGG2', 'FLAGG3'] # 該当学年において、STARプロジェクトに参加したかどうか
# サブプロット準備
plot_x = 3
num_variables = len(school_tgt_columns_cat)
plot_y = num_variables // plot_x if num_variables % plot_x == 0 else num_variables // plot_x + 1
fig, axes = plt.subplots(facecolor="white", nrows=plot_y, ncols=plot_x, figsize=(12, 10))
plot_position = 0
for i in school_tgt_columns_cat:
schools_df[i].value_counts().sort_index().plot(ax=axes[divmod(plot_position, plot_x)[0], divmod(plot_position, plot_x)[1]],
kind="bar", title=i)
plot_position += 1
# 学校データの主要列(数値列)について、データの分布を確認
school_tgt_columns_num = ['GKENRMNT', 'G1ENRMNT', 'G2ENRMNT', 'G3ENRMNT', # 該当学年における在籍者数
'G1AVGDAT', 'G2AVGDAT', 'G3AVGDAT', # 該当学年における日次平均出席数(Average Dailyy Attendance)
'G1AVGDMB', 'G2AVGDMB', 'G3AVGDMB', # 該当学年における日次平均登録者数(Average Daily Memmberships)
'GKFRLNCH', 'G1FRLNCH', 'G2FRLNCH', 'G3FRLNCH', # 該当学年におけるフリーランチ利用割合
'GKBUSED', 'G1BUSED', 'G2BUSED', 'G3BUSED', # 該当学年におけるバス利用割合
'GKNATVAM', 'G1NATVAM', 'G2NATVAM', 'G3NATVAM', # 該当学年におけるネイティブアメリカンの構成割合
'GKASIAN', 'G1ASIAN', 'G2ASIAN', 'G3ASIAN', # 該当学年におけるアジア人の構成割合
'GKBLACK', 'G1BLACK', 'G2BLACK', 'G3BLACK', # 該当学年における黒人の構成割合
'GKHSPANC', 'G1HSPANC', 'G2HSPANC', 'G3HSPANC', # 該当学年におけるヒスパニックの構成割合
'GKWHITE', 'G1WHITE', 'G2WHITE', 'G3WHITE', # 該当学年における白人の構成割合
'GKOTHRAC', 'G1OTHRAC', 'G2OTHRAC', 'G3OTHRAC'] # 該当学年におけるその他人種の構成割合
# サブプロット準備
plot_x = 3
num_variables = len(school_tgt_columns_num)
plot_y = num_variables // plot_x if num_variables % plot_x == 0 else num_variables // plot_x + 1
fig, axes = plt.subplots(facecolor="white", nrows=plot_y, ncols=plot_x, figsize=(36, 72))
plot_position = 0
for i in school_tgt_columns_num:
schools_df[i].plot(ax=axes[divmod(plot_position, plot_x)[0], divmod(plot_position, plot_x)[1]], kind="hist", title=i)
plot_position += 1
plt.tight_layout()
plt.show()
フリーランチの利用率、バスの利用率は2つの山があることが分かる。
白人と黒人の構成率についても同様。その他の人種はそもそも人数が少ない
school_tgt_columns = school_tgt_columns_cat + school_tgt_columns_num
# 各列の欠損率を確認する
indexes = range(0, len(school_tgt_columns))
plt.figure(figsize=(20, 5))
tmp_null_df = schools_df[school_tgt_columns].isnull().sum()
plt.xticks(indexes, school_tgt_columns, rotation=90)
plt.bar(tmp_null_df.index, tmp_null_df / len(schools_df))
plt.title('各生徒のレコードにおけるデータ欠損率')
plt.tight_layout()
plt.show()
等の欠損率が高いのは、該当学年、該当人種が一人もいない際に、0でなく欠損値になるためと思われる
# 比較対象校生徒データの読み込み
# tabデータにはヘッダーが存在しないため、直接カラム名を与えて読み込む
comp_students_columns = ['STDNTID', 'GENDER', 'RACE', 'PRESCHOL', 'KINDRGRT', 'BIRTHMONTH',
'BIRTHDAY', 'BIRTHYEAR', 'FLAGG1C', 'FLAGG2C', 'FLAGG3C', 'G1SCHID',
'G1TCHID', 'G1CLASSSIZE', 'G1TREADSS', 'G1TMATHSS', 'G1TLISTSS',
'GQWORDSKILLSS', 'G1READBSRAW', 'G1MATHBSRAW', 'G1READBSPCT',
'G1MATHBSPCT', 'G1READBSOBJPCT', 'G1MATHBSOBJPCT', 'G2SCHID', 'G2TCHID',
'G2CLASSSIZE', 'G2TREADSS', 'G2TMATHSS', 'G2TLISTSS', 'G2WORDSKILLSS',
'G2READBSRAW', 'G2MATHBSRAW', 'G2READBSPCT', 'G2MATH_B',
'G2READBSOBJPCT', 'G3SCHID', 'G3TCHID', 'G3CLASSSIZE', 'G3TREADSS',
'G3TMATHSS', 'G3TLANGSS', 'G3TLISTSS', 'G3SOCIALSCISS', 'G3SPELLSS',
'G3VOCABSS', 'G3MATHCOMPUTSS', 'G3MATHNUMCONCSS', 'G3MATHAPPLSS',
'G3WORDSKILLSS', 'G3READBSRAW', 'G3MATHBSRAW', 'G3READBSPCT',
'G3MATHBSPCT', 'G3READBSOBJPCT', 'G3MATHBSOBJPCT']
comp_students_df = pd.read_table('..\data\Comparison_Students.tab', header=None)
comp_students_df.columns = comp_students_columns
comp_students_df.head()
| STDNTID | GENDER | RACE | PRESCHOL | KINDRGRT | BIRTHMONTH | BIRTHDAY | BIRTHYEAR | FLAGG1C | FLAGG2C | ... | G3MATHCOMPUTSS | G3MATHNUMCONCSS | G3MATHAPPLSS | G3WORDSKILLSS | G3READBSRAW | G3MATHBSRAW | G3READBSPCT | G3MATHBSPCT | G3READBSOBJPCT | G3MATHBSOBJPCT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 30001 | 0.0 | NaN | 0.0 | 1.0 | 11.0 | 22.0 | 1979.0 | 1 | 0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 30002 | 0.0 | NaN | 0.0 | 1.0 | 8.0 | 5.0 | 1980.0 | 1 | 0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 30003 | 0.0 | 1.0 | NaN | 1.0 | NaN | NaN | NaN | 0 | 1 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 30004 | 0.0 | 1.0 | 0.0 | 1.0 | NaN | NaN | NaN | 1 | 1 | ... | 652.0 | 582.0 | 571.0 | 570.0 | 23.0 | 38.0 | 4.0 | 8.0 | 40.0 | 53.0 |
| 4 | 30005 | 0.0 | NaN | 0.0 | 1.0 | 3.0 | 17.0 | 1980.0 | 1 | 0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 56 columns
comp_students_df.shape
(1780, 56)
比較対象校の選定方法については、starUsersGuide.pdfのp.7~8に下記の記述がある。
# 一部の変数を抜粋
comp_students_tgt_columns = ['G1CLASSSIZE', 'G2CLASSSIZE', 'G3CLASSSIZE', # 該当学年において、クラスの人数
# 目的変数候補
'G1TREADSS', 'G2TREADSS','G3TREADSS', # 該当学年におけるSAT読解能力のスコア
'G1TMATHSS', 'G2TMATHSS','G3TMATHSS', # 該当学年におけるSAT算数能力のスコア
'G1TLISTSS', 'G2TLISTSS','G3TLISTSS', # 該当学年におけるSATリスニング能力のスコア
# デモグラ変数
'GENDER', # 該当学生の性別
'RACE', # 該当学生の人種
'BIRTHMONTH', 'BIRTHDAY', 'BIRTHYEAR', # 該当学生の誕生月、日、年
'G1SCHID', 'G2SCHID', 'G3SCHID',# 該当学年における学校のID
# 教師に関する特徴量
'G1TCHID', 'G2TCHID', 'G3TCHID'] # 該当学年における教師のID
# サブプロット準備
plot_x = 4
num_variables = len(comp_students_tgt_columns)
plot_y = num_variables // plot_x if num_variables % plot_x == 0 else num_variables // plot_x + 1
fig, axes = plt.subplots(facecolor="white", nrows=plot_y, ncols=plot_x, figsize=(12, 18))
plot_position = 0
# 主要な変数の分布を確認
for i in tqdm(comp_students_tgt_columns):
comp_students_df[i].plot(ax=axes[divmod(plot_position, plot_x)[0], divmod(plot_position, plot_x)[1]], kind="hist", title=i)
plot_position += 1
plt.tight_layout()
plt.show()
0%| | 0/23 [00:00<?, ?it/s]
# 各生徒のレコードにおけるデータ欠損率
indexes = range(0, len(comp_students_tgt_columns))
plt.figure(figsize=(20,5))
tmp_null_df = comp_students_df[comp_students_tgt_columns].isnull().sum()
plt.xticks(indexes, comp_students_tgt_columns, rotation=90)
plt.bar(tmp_null_df.index, tmp_null_df / len(comp_students_df))
plt.title('比較対象校の各生徒のレコードにおけるデータ欠損率')
plt.tight_layout()
plt.show()
後述のSTAR参加学生データと比べ、全体的に欠損が多い傾向がある
欠損理由はstarUsersGuide.pdfには明示されていなかったが、おそらく
という事だと思われる。
データの欠損がアトランダムであるかは不明であり、クラスサイズの割当は明確にアトランダムではないということもあり、STAR参加学生のデータと同等のものであるとは断言できないため、一旦RCTの検証では当データは利用しないことにする。
# 高校データの読み込み
# tabデータにはヘッダーが存在しないため、直接カラム名を与えて読み込む
high_schools_columns = ['HSID', 'SCHLUJRBN', 'ENRLMENT', 'SENIORS', 'LOWGRADE', 'HGHGRADE',
'NUMGRADE', 'MNRTYPCT', 'FRLCHPCT', 'NOGRDPCT', 'MINRQMNT', 'MINMATH',
'MINSCIEN', 'MINFORLG', 'MINSOCST', 'MINCOMP', 'MINENGLS', 'ALGEBRA3',
'MATH4', 'PRECALCU', 'CALCULUS', 'PROBABIL', 'TRIGONOM', 'ANALYTIC',
'SOLIDGEO', 'LINALGBR', 'FRENCH', 'FREHILVL', 'SPANISH', 'SPNHILVL',
'LATIN', 'LATNHILVL', 'LNGHILVL']
high_schools_df = pd.read_table('..\data\STAR_High_Schools.tab', header=None)
high_schools_df.columns = high_schools_columns
high_schools_df.head()
| HSID | SCHLUJRBN | ENRLMENT | SENIORS | LOWGRADE | HGHGRADE | NUMGRADE | MNRTYPCT | FRLCHPCT | NOGRDPCT | ... | ANALYTIC | SOLIDGEO | LINALGBR | FRENCH | FREHILVL | SPANISH | SPNHILVL | LATIN | LATNHILVL | LNGHILVL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 106017 | 3 | 1500 | 375.000000 | 9.0 | 12.0 | 4.0 | 2.0 | 10.0 | 4.0 | ... | 0 | 0 | 1 | 1 | 5.0 | 1 | 5.0 | 0 | NaN | 5.0 |
| 1 | 112032 | 4 | 400 | 66.666667 | 7.0 | 12.0 | 6.0 | 5.0 | 20.0 | 12.0 | ... | 0 | 0 | 0 | 0 | NaN | 1 | 2.0 | 0 | NaN | 2.0 |
| 2 | 112034 | 2 | 1100 | 275.000000 | 9.0 | 12.0 | 4.0 | 35.0 | 30.0 | 12.0 | ... | 1 | 1 | 0 | 1 | 4.0 | 1 | 4.0 | 0 | NaN | 4.0 |
| 3 | 112036 | 4 | 428 | 107.000000 | 9.0 | 12.0 | 4.0 | 4.1 | 20.8 | NaN | ... | 0 | 0 | 0 | 1 | 2.0 | 1 | 2.0 | 0 | NaN | 2.0 |
| 4 | 118044 | 4 | 450 | 37.500000 | 1.0 | 12.0 | 12.0 | 0.0 | 63.0 | 12.0 | ... | 1 | 0 | 0 | 0 | NaN | 1 | 2.0 | 0 | NaN | 2.0 |
5 rows × 33 columns
high_schools_df.shape
(161, 33)
(抜粋)
このデータには、STAR期間後の教育レベルを測れるような特徴量はないと思われるため、一旦分析対象から外す
# STAR参加学生データの読み込み
# tabデータにはヘッダーが存在しないため、直接カラム名を与えて読み込む
students_df_columns = ['STDNTID', 'GENDER', 'RACE', 'BIRTHMON', 'BIRTHDAY', 'BIRTHYEA', 'FLAGSGK', 'FLAGSG1', 'FLAGSG2', 'FLAGSG3', 'FLAGGK', 'FLAGG1',
'FLAGG2', 'FLAGG3', 'FLAGG4', 'FLAGG5', 'FLAGG6', 'FLAGG7','FLAGG8', 'FLAGPRT4', 'FLAGIDN8', 'FLAGPRT8', 'FLAGSATA', 'FLAGHSCO',
'FLAGHSGR', 'GKCLASST', 'G1CLASST', 'G2CLASST', 'G3CLASST', 'CMPSTYPE', 'CMPSDURA', 'YEARSSTA', 'YEARSSMA','GKSCHID', 'GKSURBAN',
'SKTCHID', 'GKTGEN', 'GKTRACE', 'GKTHIGHD','GKTCAREE', 'GKTYEARS', 'GKCLASSS', 'GKFREELU', 'GKREPEAT','GKSPECED', 'GKSPECIN',
'GKPRESEN', 'GKABSENT', 'GKTREADS','GKTMATHS', 'GKTLISTS', 'GKWORDSK', 'GKMOTIVR', 'GKSELFCO','G1SCHID', 'G1SURBAN', 'G1TCHID',
'G1TGEN', 'G1TRACE', 'G1THIGHD', 'G1TCAREE', 'G1TYEARS', 'G1CLASSS', 'G1FREELU', 'G1PROMOT','G1SPECED', 'G1SPECIN', 'G1PRESEN',
'G1ABSENT', 'G1TREADS', 'G1TMATHS', 'G1TLISTS', 'G1WORDSK', 'G1READBS', 'G1MATHBS', 'G1READ_B', 'G1MATH_B', 'G1READ_C', 'G1MATH_C',
'G1MOTIVR','G1SELFCO', 'G2SCHID', 'G2SURBAN', 'G2TCHID', 'G2TGEN', 'G2TRACE', 'G2THIGHD', 'G2TCAREE', 'G2TYEARS', 'G2TTRAIN', 'G2CLASSS',
'G2FREELU', 'G2PROMOT', 'G2TREADS', 'G2TMATHS', 'G2TLISTS','G2WORDSK', 'G2READBS', 'G2MATHBS', 'G2READ_B', 'G2MATH_B',
'G2READ_C', 'G2MATH_C', 'G2MOTIVR', 'G2SELFCO', 'G3SCHID','G3SURBAN', 'G3TCHID', 'G3TGEN', 'G3TRACE', 'G3THIGHD', 'G3TCAREE',
'G3TYEARS', 'G3TTRAIN', 'G3CLASSS', 'G3FREELU', 'G3PROMOT', 'G3PRESEN', 'G3ABSENT', 'G3TREADS', 'G3TMATHS', 'G3TLANGS',
'G3TLISTS', 'G3SCIENC', 'G3SOCIAL', 'G3SPELLS', 'G3VOCABS', 'G2MATHCO', 'G3MATHNU', 'G3MATHAP', 'G3WORDSK', 'G3READBS',
'G3MATHBS', 'G3READ_B', 'G3MATH_B', 'G3READ_C', 'G3MATH_C', 'G3MOTIVR', 'G3SELFCO', 'G4SCHID', 'G4SURBAN', 'G4TCHID', 'G4TGEN',
'G4TRACE', 'G4NCLASS', 'G4NWHITE', 'G4NBLACK', 'G4NOTHER', 'G4PERNWH', 'G4NFREEL', 'G4TREADS', 'G4TMATHS', 'G4TLANGS',
'G4TBATTS', 'G4SCIENC', 'G4SOCIAL', 'G4READCO', 'G4SPELLS', 'G4VOCABS', 'G4MATHCO', 'G4MATH_A', 'G4LANGEX', 'G4LANGME',
'G4STUDYS', 'G4READBS', 'G4MATHBS', 'G4PTATTN', 'G4PTHWRK', 'G4PTOTH', 'G4PTMTRL', 'G4PTLATE', 'G4PTRIES', 'G4PTRSTL',
'G4PTDISC', 'G4PTWORK', 'G4PTIMPT', 'G4PTREPR', 'G4PTANOY', 'G4PTPERS', 'G4PTKNOW', 'G4PTEXTR', 'G4PTWTHD', 'G4PTEFRT',
'G4PTCRIT', 'G4PTASKS', 'G4PTALKS', 'G4PTINTV', 'G4PTEASY', 'G4PTCRTS', 'G4PTFNSH', 'G4PTRAIS', 'G4PTSEEK', 'G4PTDSRG',
'G4PTDISS', 'G4PTEXTC', 'G4PTPERF', 'G4PTSPED', 'G4PTEFFR', 'G4PTINIT', 'G4PTNONP', 'G4PTVALU', 'G5SCHID', 'G5TREADS',
'G5TMATHS', 'G5TLANGS', 'G5TBATTS', 'G5SCIENC', 'G5SOCIAL', 'G5READCO', 'G5SPELLS', 'G5VOCABS', 'G5MATHCO', 'G5MATH_A',
'G5LANGEX', 'G5LANGME', 'G5STUDYS', 'G5READBS', 'G5MATHBS', 'G6SCHID', 'G6TREADS', 'G6TMATHS', 'G6TLANGS', 'G6SCIENC',
'G6SOCIAL', 'G6READBS', 'G6MATHBS', 'G7SCHID', 'G7TREADS', 'G7TMATHS', 'G7TLANGS', 'G7TBATTS', 'G7SCIENC', 'G7SOCIAL',
'G7READCO', 'G7SPELLS', 'G7VOCABS', 'G7MATHCO', 'G7MATH_A', 'G7LANGEX', 'G7LANGME', 'G7STUDYS', 'G7READBS', 'G7MATHBS',
'G8SCHID', 'G8SURBAN', 'G8TREADS', 'G8TMATHS', 'G8TLANGS', 'G8TBATTS', 'G8SCIENC', 'G8SOCIAL', 'G8READCO', 'G8SPELLS',
'G8VOCABS', 'G8MATHCO', 'G7MATH_A.1', 'G8LANGEX', 'G8LANGME', 'G8STUDYS', 'G8READBS', 'G8MATHBS', 'G8IDPROU', 'G8IDRSPT',
'G8IDGDJB', 'G8IDATTN', 'G8IDACTV', 'G8IDIMPT', 'G8IDPOPU', 'G8IDUSLS', 'G8IDFRNL', 'G8IDCARE', 'G8IDPLAC', 'G8IDPROB',
'G8IDUSEF', 'G8IDFRNC', 'G8IDTRYG', 'G8IDFAVR', 'G8IDINTR', 'G8IDWAST', 'G8IDDROP', 'G8IDFRNU', 'G8IDMIMP', 'G8IDFRNW',
'G8IDFRNS', 'G8IDBLNG', 'G8IDVALU', 'G8IDTOTL', 'G8PEABSN', 'G8PEPRNT', 'G8PEATTN', 'G8PEMTRL', 'G8PEASGN', 'G8PELATE',
'G8PEPERS', 'G8PECRTS', 'G8PEMORE', 'G8PEANOY', 'G8PEVALU', 'G8PECRIT', 'G8PEDISC', 'G8PEREPR', 'G8PEABUS', 'G8PEDISS',
'G8PEEFFR', 'G8PEINIT', 'G8PENONP', 'G8PMABSN', 'G8PMPRNT', 'G8PMATTN', 'G8PMMTRL', 'G8PMASGN', 'G8PMLATE', 'G8PMPERS',
'G8PMCRTS', 'G8PMMORE', 'G8PMANOY', 'G8PMVALU', 'G8PMCRIT', 'G8PMDISC', 'G8PMREPR', 'G8PMABUS', 'G8PMDISS', 'G8PMEFFR',
'G8PMINIT', 'G8PMNONP', 'HSID', 'HSFRNCH1', 'HSFRNCH2', 'HSFRNCH3', 'HSFRNCH4', 'HSGRMN1', 'HSGRMN2', 'HSGRMN3', 'HSGRMN4', 'HSLATIN1',
'HSLATIN2', 'HSLATIN3', 'HSLATIN4', 'HSSPANI1', 'HSSPANI2', 'HSSPANI3', 'HSSPANI4', 'HSSPANI5', 'HSFLANG1', 'HSFLANG2',
'HSFLANG3', 'HSFLANG4', 'HSFLANGT', 'HSMATH1', 'HSMATH2', 'HSMATH3', 'HSMATH4', 'HSMATH5', 'HSMATHTO', 'HSCIENTO',
'HSGPAFLA', 'HSGPAMAT', 'HSGPASCI', 'HSGPAOVE', 'HSLVLFLA', 'HSLVLMTH', 'HSYRSCOR', 'HSCTSRC', 'HSSAT', 'HSACT', 'HSTEST',
'HSSATMAT', 'HSSATVER', 'HSSATTOT', 'HSACTCOM', 'HSACTTOT', 'HSACTENG', 'HSACTMAT', 'HSACTREA', 'HSACTSCI', 'HSSATCON',
'HSACTCON', 'HSGRDADD', 'HSGRDCOL']
students_df = pd.read_table('..\data\STAR_Students.tab', header=None)
students_df.columns = students_df_columns
students_df.head()
| STDNTID | GENDER | RACE | BIRTHMON | BIRTHDAY | BIRTHYEA | FLAGSGK | FLAGSG1 | FLAGSG2 | FLAGSG3 | ... | HSACTCOM | HSACTTOT | HSACTENG | HSACTMAT | HSACTREA | HSACTSCI | HSSATCON | HSACTCON | HSGRDADD | HSGRDCOL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10000 | 1.0 | 1.0 | 1.0 | 22.0 | 1979.0 | 0 | 1 | 1 | 1 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 10001 | 1.0 | 1.0 | 2.0 | 20.0 | 1980.0 | 1 | 0 | 0 | 0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 10002 | 2.0 | 2.0 | 7.0 | 21.0 | 1979.0 | 0 | 0 | 0 | 1 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 |
| 3 | 10003 | 1.0 | 1.0 | 5.0 | 28.0 | 1980.0 | 0 | 1 | 1 | 1 | ... | 22.0 | 89.0 | 16.0 | 21.0 | 28.0 | 24.0 | 860.0 | 22.0 | 1.0 | 1.0 |
| 4 | 10004 | 2.0 | 2.0 | 1.0 | 2.0 | 1980.0 | 0 | 0 | 1 | 1 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 |
5 rows × 379 columns
students_df.shape
(11601, 379)
(列情報抜粋)
ID、その他
デモグラフィック変数
FLAGSG*(K,1,2,3): STARプロジェクトにおいて、該当年においてSTARプロジェクトに参加したかを示すフラグ
FLAGG*(K,1,2,3,4,5,6,7,8): STARプロジェクトや、それ以降の年度において成績データを利用できるかどうかのフラグ
FLAGSATA: SAT,ACTのスコア(※3割程度しか使えない)
G*(K,1,2,3)CLASST: STARプロジェクトにおいて、該当年においてどのタイプのクラスに割り当てられたかどうかを示す
G*(K,1,2,3)SURBAN: 該当学年における学校の都会度
'G1TCHID', 'G2TCHID', 'G3TCHID'] # 該当学年における教師のID
G*(K,1,2,3)FREELU: 該当学年においてフリーランチを希望したかどうか
G*(K,1,2,3)TGEN: STARプロジェクトにおいて、該当年における教師の性別
G*(K,1,2,3)TRACE: STARプロジェクトにおいて、該当年における教師の人種
G*(K,1,2,3)THIGHD: STARプロジェクトにおいて、該当年における教師の最終学歴
G*(K,1,2,3)TCAREE: STARプロジェクトにおいて、該当年における教師の職位
G*(K,1,2,3)TYEARS: STARプロジェクトにおいて、該当年における教師の勤続年数students_df.head(5)
G*(1,2,3)PROMOT: 該当学年において進級を進められたかどうか
G*(K,1)SPECED: 該当学年において特別支援が必要になったかどうか
students_tgt_columns = ['STDNTID', # 該当学生のID
# STARプロジェクトに関する変数
'FLAGSGK', 'FLAGSG1', 'FLAGSG2', 'FLAGSG3', # 該当学年においてSTARプロジェクトに参加したかを示すフラグ
'GKCLASST', 'G1CLASST', 'G2CLASST', 'G3CLASST', # 該当学年においてどのタイプのクラスに割り当てられたかどうかを示す
'GKCLASSS', 'G1CLASSS', 'G2CLASSS', 'G3CLASSS', # 該当学年において、クラスの人数
'CMPSTYPE', # 4年分のクラスを所定のロジックに基づいてトータルに判定
# 上記ロジックは複雑なので、特にstarUsersGuide.pdfのp.10, 145,146を参照
'CMPSDURA', # 上記のCMPSTYPEについて、何年間該当タイプに所属したかを表す(1~4)
'YEARSSTA', # STARプロジェクトに何年参加していたかを表す(1~4)
'YEARSSMA', # SMALLクラスに何年所属していたかを表す(0~4)
# 目的変数候補
'HSACTCON', # SAT,ACTのスコア。SATのみの場合はACTのスコアに変換(※3割程度しか使えない)
'HSGRDCOL', # 高校を卒業したかどうか
'GKSELFCO', 'G1SELFCO', 'G2SELFCO','G3SELFCO', # 該当学年における「自己概念」心理尺度のスコア
'GKMOTIVR', 'G1MOTIVR', 'G2MOTIVR','G3MOTIVR', # 該当学年における「モチベーション」心理尺度のスコア
'GKTREADS', 'G1TREADS', 'G2TREADS','G3TREADS', # 該当学年におけるSAT読解能力のスコア
'GKTMATHS', 'G1TMATHS', 'G2TMATHS','G3TMATHS', # 該当学年におけるSAT算数能力のスコア
'GKTLISTS', 'G1TLISTS', 'G2TLISTS','G3TLISTS', # 該当学年におけるSATリスニング能力のスコア
# デモグラ変数
'GENDER', # 該当学生の性別
'RACE', # 該当学生の人種
'BIRTHMON', 'BIRTHDAY', 'BIRTHYEA', # 該当学生の誕生月、日、年
'GKSCHID', 'G1SCHID', 'G2SCHID', 'G3SCHID',# 該当学年における学校のID
'GKSURBAN', 'G1SURBAN', 'G2SURBAN', 'G3SURBAN', # 該当学年における学校の都会度
'GKFREELU', 'G1FREELU', 'G2FREELU', 'G3FREELU', # 該当学年においてフリーランチを希望したかどうか
# 教師に関する特徴量
'G1TCHID', 'G2TCHID', 'G3TCHID', # 該当学年における教師のID
'GKTGEN', 'G1TGEN', 'G2TGEN', 'G3TGEN', # 該当学年における教師の性別
'GKTRACE', 'G1TRACE', 'G2TRACE', 'G3TRACE', # 該当学年における教師の人種
'GKTHIGHD', 'G1THIGHD', 'G2THIGHD', 'G3THIGHD', # 該当学年における教師の最終学歴
'GKTYEARS', 'G1TYEARS', 'G2TYEARS', 'G3TYEARS', # 該当学年における教師の経験年数
'G2TTRAIN', 'G3TTRAIN'] # 該当学年における教師がSTAR参加教師向けの講習を受けたかどうか
# 1年生~2年生の間の夏休みに、STAR参加教師向けの講習を実施した。(なお、参加講師とそれ以外で教育成果の違いはみられなかったとのこと)
# 各生徒のレコードにおけるデータ欠損率
indexes = range(0, len(students_tgt_columns))
plt.figure(figsize=(20,5))
tmp_null_df = students_df[students_tgt_columns].isnull().sum()
plt.xticks(indexes, students_tgt_columns, rotation=90)
plt.bar(tmp_null_df.index, tmp_null_df / len(students_df))
plt.title('各生徒のレコードにおけるデータ欠損率')
plt.tight_layout()
plt.show()
# STAR 参加年数分布
students_df.YEARSSTA.value_counts().sort_index().plot(kind="bar", title='STAR参加合計年数')
plt.tight_layout()
plt.show()
# STAR年度ごとの参加者数分布
star_each_join = []
flags = ['FLAGSGK', 'FLAGSG1', 'FLAGSG2', 'FLAGSG3']
for flag in flags:
star_each_join.append(len(students_df[students_df[flag]==1]))
pd.Series(star_each_join, index=flags).plot(kind="bar", title='STAR年度ごとの参加者数')
plt.tight_layout()
plt.show()
各年度ごとに6,000~7,000人程度がそれぞれ参加し、合計参加年数もまちまち(1か4が多い)という状況
# 各学年データにおいて、主要列ごとの欠損率
star_grades = ['GK', 'G1', 'G2', 'G3']
for grade in star_grades:
tmp_columns = [s for s in students_tgt_columns if grade in s]
flag_column = [s for s in tmp_columns if 'FLAGS' in s][0]
tmp_df = students_df[(students_df[flag_column]==1)]
tmp_columns.remove(flag_column)
tmp_null_df = tmp_df[tmp_columns].isnull().sum()
indexes = range(0, len(tmp_columns))
plt.figure(figsize=(20,5))
plt.xticks(indexes, tmp_columns, rotation=90)
plt.bar(tmp_null_df.index, tmp_null_df / len(tmp_df))
plt.title(grade + 'における各データの欠損率')
plt.tight_layout()
plt.show()
years = students_df['BIRTHYEA'].astype(str).apply(lambda s: s.split(".")[0].zfill(4))
months = students_df['BIRTHMON'].astype(str).apply(lambda s: s.split(".")[0].zfill(2))
days = students_df['BIRTHDAY'].astype(str).apply(lambda s: s.split(".")[0].zfill(2))
# 誕生年月日列の追加
students_df['BIRTH_DATE'] = pd.to_datetime(years + months + days, errors='coerce')
# 誕生年月列の追加
students_df['BIRTH_MONTH'] = years + '-' + months
students_df.BIRTH_MONTH = students_df.BIRTH_MONTH.where(~students_df.BIRTH_MONTH.str.contains('nan'), 'NULL')
students_df['BIRTHYEA'].value_counts().sort_index().plot(kind="bar", title='BIRTH_YEAR')
<AxesSubplot: title={'center': 'BIRTH_YEAR'}>
# STAR参加学生全員について、誕生日の分布を月ごとに可視化
# 赤く塗った部分がRedshirting(入学の遅延)
# 青く塗った部分がGrade Skipping(飛び級)
BIRTH_MONTH_VALUES = students_df.BIRTH_MONTH.value_counts().sort_index()
bm_index = range(0,len(BIRTH_MONTH_VALUES))
bm_values = BIRTH_MONTH_VALUES.values
plt.figure(figsize=(20,5))
plt.xticks(bm_index, BIRTH_MONTH_VALUES.index, rotation=90)
span_start = BIRTH_MONTH_VALUES.index.get_loc('1979-10') - 0.5
span_end = BIRTH_MONTH_VALUES.index.get_loc('1980-09') + 0.5
plt.axvspan(xmin=0, xmax=span_start, color="red", alpha=0.4)
plt.axvspan(xmin=span_start, xmax=span_end, color="grey", alpha=0.4)
plt.axvspan(xmin=span_end, xmax=len(BIRTH_MONTH_VALUES) - 1.5, color="blue", alpha=0.4)
plt.bar(bm_index, bm_values)
plt.show()
テネシー州の1990年時点でのcut-off-date(学年境界の閾値となる月日) は9月30日
http://www.ecs.org/clearinghouse/73/67/7367.pdf
1979年9月以前の生まれは、(恐らく) Academic Redshirting (意図的な入学時期の遅れ)を呼ばれるもの
1979年10月~1980年9月までの生まれは、通常の入学
1980年10月以降の生まれは、(恐らく) 飛び級
# 1979年10月1日~1980年9月30日(該当年度に満6歳)の生まれ
print('該当年度の生まれ: ' + str(len(students_df[(students_df['BIRTH_DATE']>'1979-09-30')&(students_df['BIRTH_DATE']<='1980-09-30')])))
# 生年月日未登録
print('生年月日未登録: ' + str(len(students_df[students_df['BIRTH_DATE'].isnull()])))
# 該当年度以前の生まれ
print('該当年度以前の生まれ(redshirting): ' + str(len(students_df[(students_df['BIRTH_DATE']<='1979-09-30')])))
# 該当年度以降の生まれ
print('該当年度以降の生まれ(飛び級): ' + str(len(students_df[(students_df['BIRTH_DATE']>='1980-09-30')])))
該当年度の生まれ: 8596 生年月日未登録: 71 該当年度以前の生まれ(redshirting): 2739 該当年度以降の生まれ(飛び級): 219
# 通常(Common)、入学延期(Redshirting)、飛び級(Skipping)を示すカテゴリ変数を導入
import datetime
def func_birthdate_categorical_string(x):
if pd.isnull(x):
return np.nan
elif x > datetime.datetime(1979,9,30) and x <= datetime.datetime(1980,9,30):
return 'Common'
elif x <= datetime.datetime(1979,9,30):
return 'Redshirting'
else:
return 'Skipping'
students_df['ENTRANCE_GRADE_STRING'] = students_df['BIRTH_DATE'].apply(func_birthdate_categorical_string)
# 通常(Common)、入学延期(Redshirting)、飛び級(Skipping)を示すカテゴリ変数を導入
import datetime
def func_birthdate_categorical(x):
if pd.isnull(x):
return np.nan
elif x > datetime.datetime(1979,9,30) and x <= datetime.datetime(1980,9,30):
return 0
elif x <= datetime.datetime(1979,9,30):
return 1
else:
return 2
students_df['ENTRANCE_GRADE'] = students_df['BIRTH_DATE'].apply(func_birthdate_categorical)
# 1980年9月30日を基準に、生年月日との日付差分を計算
def func_birthdate_datediff(x):
if pd.isnull(x):
return np.nan
else:
return (datetime.datetime(1980,9,30) - x).days
students_df['DATEDIFF'] = students_df['BIRTH_DATE'].apply(func_birthdate_datediff)
# 9月30日を基準に、生年月日との日付差分を計算
def func_birthdate_datediff_peryear(x):
if pd.isnull(x):
return np.nan
else:
if datetime.datetime(int(x.year), 9, 30) >= x:
return (x - datetime.datetime(int(x.year)-1, 9, 30)).days
else:
return (x - datetime.datetime(int(x.year), 9, 30)).days
students_df['DATEDIFF_FROM_ENTRANCE_DATE'] = students_df['BIRTH_DATE'].apply(func_birthdate_datediff_peryear)
# RACEを英語→日本語に置き換えた列を作成
RACE_JAPANESE_dict = {1: '白人', 2:'黒人', 3:'アジア人', 4:'ヒスパニック', 5:'ネイティブアメリカン', 6:'その他'}
students_df['RACE_JAPANESE'] = students_df['RACE'].replace(RACE_JAPANESE_dict)
pt=students_df.replace(RACE_JAPANESE_dict).pivot_table(index='RACE_JAPANESE', columns='GENDER', values='RACE',aggfunc=lambda x : len(x))
pt.apply(lambda x : x/sum(x), axis=1).plot.bar(alpha=0.6, figsize=(12,3), stacked=True)
pt=students_df.pivot_table(index='RACE_JAPANESE', columns='GKFREELU', values='RACE',aggfunc=lambda x : len(x))
pt.apply(lambda x : x/sum(x), axis=1).plot.bar(alpha=0.6, figsize=(12,3), stacked=True)
pt=students_df.pivot_table(index='RACE_JAPANESE', columns='ENTRANCE_GRADE', values='RACE',aggfunc=lambda x : len(x))
pt.apply(lambda x : x/sum(x), axis=1).plot.bar(alpha=0.6, figsize=(12,3), stacked=True)
<AxesSubplot: xlabel='RACE_JAPANESE'>
# 人数分布の確認
race_dict = {1: '白人', 2: '黒人', 3: 'アジア人',
4: 'ヒスパニック', 5: 'ネイティブアメリカン', 6: 'その他', np.nan: '不明'}
students_df['RACE'].replace(race_dict).value_counts().sort_index().plot(kind="bar", title='人種構成')
plt.tight_layout()
plt.show()
# ENRMNTデータ(学年次の所属人数データを付与)
tmp_df = pd.merge(students_df[['STDNTID', 'GKSCHID', 'G1SCHID', 'G2SCHID', 'G3SCHID']], schools_df[['SCHID', 'GKENRMNT']],
how='outer', left_on='GKSCHID', right_on='SCHID')
tmp_df = pd.merge(tmp_df[['STDNTID', 'GKENRMNT', 'GKSCHID','G1SCHID', 'G2SCHID', 'G3SCHID']], schools_df[['SCHID', 'G1ENRMNT']],
how='outer',left_on='G1SCHID', right_on='SCHID')
tmp_df = pd.merge(tmp_df[['STDNTID', 'GKENRMNT', 'G1ENRMNT', 'GKSCHID', 'G1SCHID', 'G2SCHID', 'G3SCHID']], schools_df[['SCHID', 'G2ENRMNT']],
how='outer', left_on='G2SCHID', right_on='SCHID')
tmp_df = pd.merge(tmp_df[['STDNTID', 'GKENRMNT', 'G1ENRMNT', 'G2ENRMNT', 'GKSCHID', 'G1SCHID', 'G2SCHID', 'G3SCHID']], schools_df[['SCHID', 'G3ENRMNT']],
how='outer', left_on='G3SCHID', right_on='SCHID')[['STDNTID', 'GKENRMNT', 'G1ENRMNT', 'G2ENRMNT','G3ENRMNT']]
# STAR期間中、最古&最新のENRMNTデータを取得する
def get_latest_enrmnt(x):
if np.isnan(x['G3ENRMNT'])==False:
return x['G3ENRMNT']
elif np.isnan(x['G2ENRMNT'])==False:
return x['G2ENRMNT']
elif np.isnan(x['G1ENRMNT'])==False:
return x['G1ENRMNT']
elif np.isnan(x['GKENRMNT'])==False:
return x['GKENRMNT']
else:
return np.nan
def get_oldest_enrmnt(x):
if np.isnan(x['GKENRMNT'])==False:
return x['GKENRMNT']
elif np.isnan(x['G1ENRMNT'])==False:
return x['G1ENRMNT']
elif np.isnan(x['G2ENRMNT'])==False:
return x['G2ENRMNT']
elif np.isnan(x['G3ENRMNT'])==False:
return x['G3ENRMNT']
else:
return np.nan
tmp_df['LATEST_ENRLMNT']=tmp_df.apply(lambda x:get_latest_enrmnt(x),axis=1)
tmp_df['OLDEST_ENRLMNT']=tmp_df.apply(lambda x:get_oldest_enrmnt(x),axis=1)
students_df = pd.merge(students_df, tmp_df[['STDNTID', 'GKENRMNT', 'G1ENRMNT', 'G2ENRMNT','G3ENRMNT',
'LATEST_ENRLMNT', 'OLDEST_ENRLMNT']], how='left', on='STDNTID')
# 学校規模の可視化
colors = list(mcolors.BASE_COLORS.keys())
ENRMNT_LISTS = ['GKENRMNT', 'G1ENRMNT', 'G2ENRMNT','G3ENRMNT']
fig, ax = plt.subplots(facecolor="w")
for (label, num) in zip(ENRMNT_LISTS, range(0, len(ENRMNT_LISTS))):
ax.hist(students_df[label], bins=20, alpha=0.3, histtype='stepfilled', color=colors[num], label=label)
ax.legend()
plt.show()
# ENRMNTデータ(学年の所属人数データを付与)
tmp_df = pd.merge(students_df[['STDNTID', 'GKSCHID', 'G1SCHID', 'G2SCHID', 'G3SCHID']], schools_df[['SCHID', 'SCHLURBN']],
how='outer', left_on='GKSCHID', right_on='SCHID')
tmp_df.rename(columns={'SCHLURBN': 'GKSCHLURBN'}, inplace=True)
tmp_df = pd.merge(tmp_df[['STDNTID', 'GKSCHLURBN', 'G1SCHID', 'G2SCHID', 'G3SCHID']], schools_df[['SCHID', 'SCHLURBN']],
how='outer',left_on='G1SCHID', right_on='SCHID')
tmp_df.rename(columns={'SCHLURBN': 'G1SCHLURBN'}, inplace=True)
tmp_df = pd.merge(tmp_df[['STDNTID', 'GKSCHLURBN', 'G1SCHLURBN','G1SCHID', 'G2SCHID', 'G3SCHID']], schools_df[['SCHID', 'SCHLURBN']],
how='outer', left_on='G2SCHID', right_on='SCHID')
tmp_df.rename(columns={'SCHLURBN': 'G2SCHLURBN'}, inplace=True)
tmp_df = pd.merge(tmp_df[['STDNTID', 'GKSCHLURBN', 'G1SCHLURBN', 'G2SCHLURBN', 'G1SCHID', 'G2SCHID', 'G3SCHID']], schools_df[['SCHID', 'SCHLURBN']],
how='outer', left_on='G3SCHID', right_on='SCHID')
tmp_df.rename(columns={'SCHLURBN': 'G3SCHLURBN'}, inplace=True)
tmp_df = tmp_df[['STDNTID', 'GKSCHLURBN', 'G1SCHLURBN', 'G2SCHLURBN','G3SCHLURBN']]
def get_latest_schurbn(x):
if np.isnan(x['G3SCHLURBN'])==False:
return x['G3SCHLURBN']
elif np.isnan(x['G2SCHLURBN'])==False:
return x['G2SCHLURBN']
elif np.isnan(x['G1SCHLURBN'])==False:
return x['G1SCHLURBN']
elif np.isnan(x['GKSCHLURBN'])==False:
return x['GKSCHLURBN']
else:
return np.nan
def get_oldest_schurbn(x):
if np.isnan(x['GKSCHLURBN'])==False:
return x['GKSCHLURBN']
elif np.isnan(x['G1SCHLURBN'])==False:
return x['G1SCHLURBN']
elif np.isnan(x['G2SCHLURBN'])==False:
return x['G2SCHLURBN']
elif np.isnan(x['G3SCHLURBN'])==False:
return x['G3SCHLURBN']
else:
return np.nan
tmp_df['LATEST_SCHLURBN']=tmp_df.apply(lambda x:get_latest_schurbn(x),axis=1)
tmp_df['OLDEST_SCHLURBN']=tmp_df.apply(lambda x:get_oldest_schurbn(x),axis=1)
students_df = pd.merge(students_df, tmp_df[['STDNTID', 'GKSCHLURBN', 'G1SCHLURBN', 'G2SCHLURBN','G3SCHLURBN',
'LATEST_SCHLURBN', 'OLDEST_SCHLURBN']], how='left', on='STDNTID')
# 学校都会度の可視化
colors = list(mcolors.BASE_COLORS.keys())
SCHLURBN_LISTS = ['GKSCHLURBN', 'G1SCHLURBN', 'G2SCHLURBN','G3SCHLURBN']
fig, ax = plt.subplots(facecolor="w")
for (label, num) in zip(SCHLURBN_LISTS, range(0, len(SCHLURBN_LISTS))):
ax.hist(students_df[label], bins=20, alpha=0.3, histtype='stepfilled', color=colors[num], label=label)
ax.legend()
plt.show()
# 各学年ごとの学校の都会度の違い
tmp_list = []
for i in SCHLURBN_LISTS:
tmp_df = students_df[i].value_counts(normalize=True, sort=False)
tmp_list.append(list(tmp_df.sort_index()))
tmp_df = pd.DataFrame(tmp_list, index=SCHLURBN_LISTS, columns=[1,2,3,4])
tmp_df.plot.bar(y=[1,2,3,4], alpha=0.6, figsize=(12,3), stacked=True)
<AxesSubplot: >
man_woman_dict = {1: '1: 男性', 2:'2: 女性', np.nan:'不明'}
students_df.GENDER.replace(man_woman_dict).value_counts().sort_index().plot(kind="bar", title='男女構成')
plt.tight_layout()
plt.show()
# フリーランチ利用状況の可視化
# 各STAR学年の参加フラグがたっているもののみ可視化
FREELU_LIST = [s for s in students_df.columns if 'FREELU' in s]
yes_no_dict = {1: '1: はい', 2:'2: いいえ', np.nan:'NULL'}
star_grades = ['GK', 'G1', 'G2', 'G3']
for grade in star_grades:
tmp_columns = [s for s in students_tgt_columns if grade in s]
flag_column = [s for s in tmp_columns if 'FLAGS' in s][0]
freelu = [s for s in FREELU_LIST if grade in s][0]
tmp_df = students_df[(students_df[flag_column]==1)]
tmp_df[freelu].replace(yes_no_dict).value_counts().sort_index().plot(kind="bar", title=freelu)
plt.tight_layout()
plt.show()
# 教師の性別
TGEN_LIST = [s for s in students_df.columns if ('TGEN' in s) & ~('MATCH' in s)]
man_woman_dict = {1: '1: 男性', 2:'2: 女性', np.nan:'NULL'}
star_grades = ['GK', 'G1', 'G2', 'G3']
for grade in star_grades:
tmp_columns = [s for s in students_tgt_columns if grade in s]
flag_column = [s for s in tmp_columns if 'FLAGS' in s][0]
tgen = [s for s in TGEN_LIST if grade in s][0]
tmp_df = students_df[(students_df[flag_column]==1)]
tmp_df[tgen].replace(man_woman_dict).value_counts().sort_index().plot(kind="bar", title=tgen)
plt.show()
Education at a glance 1998
https://www.oecd-ilibrary.org/education/education-at-a-glance-1998_eag-1998-en
1992時点でのアメリカのPrimary educationにおける女性比率は86%。(低学年ほど女性の比率が高い)
# 学年ごとに「教師と性別が一致したか」の特徴量作成
def columns_match_check(x, col1, col2):
if np.isnan(x[col1]):
return np.nan
elif np.isnan(x[col2]):
return np.nan
elif x[col1] == x[col2]:
return 1
else:
return 0
students_df = students_df.copy()
students_df['GKTGEN_MATCH'] = students_df.apply(lambda x:columns_match_check(x, col1='GKTGEN', col2='GENDER'),axis=1)
students_df['G1TGEN_MATCH'] = students_df.apply(lambda x:columns_match_check(x, col1='G1TGEN', col2='GENDER'),axis=1)
students_df['G2TGEN_MATCH'] = students_df.apply(lambda x:columns_match_check(x, col1='G2TGEN', col2='GENDER'),axis=1)
students_df['G3TGEN_MATCH'] = students_df.apply(lambda x:columns_match_check(x, col1='G3TGEN', col2='GENDER'),axis=1)
# 学年ごとに「教師と人種が一致したか」の特徴量作成
students_df = students_df.copy()
students_df['GKTRACE_MATCH'] = students_df.apply(lambda x:columns_match_check(x, col1='GKTRACE', col2='RACE'),axis=1)
students_df['G1TRACE_MATCH'] = students_df.apply(lambda x:columns_match_check(x, col1='G1TRACE', col2='RACE'),axis=1)
students_df['G2TRACE_MATCH'] = students_df.apply(lambda x:columns_match_check(x, col1='G2TRACE', col2='RACE'),axis=1)
students_df['G3TRACE_MATCH'] = students_df.apply(lambda x:columns_match_check(x, col1='G3TRACE', col2='RACE'),axis=1)
# 「教師と性別が一致したか」の特徴量可視化
TGENMATCH_LIST = [s for s in students_df.columns if ('TGEN' in s) & ('MATCH' in s)]
match_dict = {0: '0: 一致しなかった', 1:'1: 一致した', np.nan:'NULL'}
star_grades = ['GK', 'G1', 'G2', 'G3']
for grade in star_grades:
tmp_columns = [s for s in students_tgt_columns if grade in s]
flag_column = [s for s in tmp_columns if 'FLAGS' in s][0]
tgen = [s for s in TGENMATCH_LIST if grade in s][0]
tmp_df = students_df[(students_df[flag_column]==1)]
tmp_df[tgen].replace(match_dict).value_counts().sort_index().plot(kind="bar", title=tgen)
plt.show()
# 「教師と人種が一致したか」の特徴量可視化
TRACEMATCH_LIST = [s for s in students_df.columns if ('TRACE' in s) & ('MATCH' in s)]
match_dict = {0: '0: 一致しなかった', 1:'1: 一致した', np.nan:'NULL'}
star_grades = ['GK', 'G1', 'G2', 'G3']
for grade in star_grades:
tmp_columns = [s for s in students_tgt_columns if grade in s]
flag_column = [s for s in tmp_columns if 'FLAGS' in s][0]
trace = [s for s in TRACEMATCH_LIST if grade in s][0]
tmp_df = students_df[(students_df[flag_column]==1)]
tmp_df[trace].replace(match_dict).value_counts().sort_index().plot(kind="bar", title=trace)
plt.show()
# 教師の最終学位の可視化
highest_degree_dict = {1:'1: 準学士', 2:'2: 学士', 3:'3: 修士', 4:'4: 修士+',
5:'6: 専門職学位', 6:'6: 博士', np.nan:'NULL'}
THIGHD_LIST = [s for s in students_df.columns if ('THIGHD' in s)]
star_grades = ['GK', 'G1', 'G2', 'G3']
for grade in star_grades:
tmp_columns = [s for s in students_tgt_columns if grade in s]
flag_column = [s for s in tmp_columns if 'FLAGS' in s][0]
thighd = [s for s in THIGHD_LIST if grade in s][0]
tmp_df = students_df[(students_df[flag_column]==1)]
tmp_df[thighd].replace(highest_degree_dict).value_counts().sort_index().plot(kind="bar", title=thighd)
plt.show()
# 教師の今後のキャリア希望について可視化
career_dict = {1:'1: 教師ルートを望まない', 2:'2: Apprentice(見習い)', 3:'3: Probation(実習生)', 4:'4: レベル1',
5:'5: レベル2', 6:'6: レベル3', 7:'7:ペンディング', np.nan:'NULL'}
TCAREE_LIST = [s for s in students_df.columns if ('TCAREE' in s)]
star_grades = ['GK', 'G1', 'G2', 'G3']
for grade in star_grades:
tmp_columns = [s for s in students_tgt_columns if grade in s]
flag_column = [s for s in tmp_columns if 'FLAGS' in s][0]
tcaree = [s for s in TCAREE_LIST if grade in s][0]
tmp_df = students_df[(students_df[flag_column]==1)]
tmp_df[tcaree].replace(career_dict).value_counts().sort_index().plot(kind="bar", title=tcaree)
plt.show()
# 教師の経験年数の可視化
TYEARS_LIST = [s for s in students_df.columns if ('TYEARS' in s)]
star_grades = ['GK', 'G1', 'G2', 'G3']
for grade in star_grades:
tmp_columns = [s for s in students_tgt_columns if grade in s]
flag_column = [s for s in tmp_columns if 'FLAGS' in s][0]
tyears = [s for s in TYEARS_LIST if grade in s][0]
tmp_df = students_df[(students_df[flag_column]==1)]
students_df[tyears].plot.hist(bins=np.arange(0, 50, 2), title=tyears)
plt.show()
心理尺度関係スコア(STAR期間中)
SAT関係スコア(STAR期間中)
STAR期間後のデータ
# STAR期間中の各目的変数を可視化
instar_target_variables = ['SELFCO', # 心理尺度「自己概念」
'MOTIVR', # 心理尺度「モチベーション」
'TLISTS', # SATリスニングテスト
'TREADS', # SAT読解力テスト
'TMATHS'] # SAT数学テスト
colors = list(mcolors.BASE_COLORS.keys())
star_grades = ['GK', 'G1', 'G2', 'G3']
for target in instar_target_variables:
star_re = re.compile(r'GK|G1|G2|G3')
TARGET_LIST = [s for s in students_df.columns if (target in s) & bool(star_re.search(s))]
fig, ax = plt.subplots(facecolor="w")
for (label, num) in zip(TARGET_LIST, range(0, len(TARGET_LIST))):
ax.hist(students_df[label], bins=50, alpha=0.3, histtype='stepfilled', color=colors[num], label=label)
ax.legend()
ax.set_title('STAR期間中の各{target}の分布'.format(target=target))
plt.show()
## STAR期間中について、心理尺度データの最新のものを抽出した列を作成
def get_motivr(x):
if np.isnan(x['G3MOTIVR'])==False:
return x['G3MOTIVR']
elif np.isnan(x['G2MOTIVR'])==False:
return x['G2MOTIVR']
elif np.isnan(x['G1MOTIVR'])==False:
return x['G1MOTIVR']
#elif np.isnan(x['GKMOTIVR'])==False:
# return x['GKMOTIVR']
else:
return np.nan
def get_selfco(x):
if np.isnan(x['G3SELFCO'])==False:
return x['G3SELFCO']
elif np.isnan(x['G2SELFCO'])==False:
return x['G2SELFCO']
elif np.isnan(x['G1SELFCO'])==False:
return x['G1SELFCO']
elif np.isnan(x['GKSELFCO'])==False:
return x['GKSELFCO']
else:
return np.nan
def get_motivr(x):
if np.isnan(x['G3MOTIVR'])==False:
return x['G3MOTIVR']
elif np.isnan(x['G2MOTIVR'])==False:
return x['G2MOTIVR']
elif np.isnan(x['G1MOTIVR'])==False:
return x['G1MOTIVR']
elif np.isnan(x['GKMOTIVR'])==False:
return x['GKMOTIVR']
else:
return np.nan
def get_selfco(x):
if np.isnan(x['G3SELFCO'])==False:
return x['G3SELFCO']
elif np.isnan(x['G2SELFCO'])==False:
return x['G2SELFCO']
elif np.isnan(x['G1SELFCO'])==False:
return x['G1SELFCO']
elif np.isnan(x['GKSELFCO'])==False:
return x['GKSELFCO']
else:
return np.nan
students_df = students_df.copy()
students_df['STAR_LATEST_MOTIVR'] = students_df.apply(lambda x:get_motivr(x),axis=1)
students_df['STAR_LATEST_SELFCO'] = students_df.apply(lambda x:get_selfco(x),axis=1)
# STAR期間後の目的変数について、列ごとの欠損率を確認
afterstar_variables = {'STAR_LATEST_MOTIVR': '最新のMOTIVR',
'STAR_LATEST_SELFCO': '最新のSELFCO',
'HSSATTOT': '高校時のSATのスコア',
'HSACTCOM': '高校時のACTのスコア',
'HSACTCON': '高校時でのSAT_ACT統合スコア',
'HSGRDADD': '高校を卒業したかどうか(2値変数)',
'HSGRDCOL': '高校を卒業したかどうか(3値以上)'}
students_df_afterstar = students_df[afterstar_variables].isnull().sum()
indexes = range(0,len(afterstar_variables))
plt.figure(figsize=(8, 5))
plt.xticks(indexes, afterstar_variables.values(), rotation=45)
plt.bar(students_df_afterstar.index, students_df_afterstar / len(students_df))
plt.tight_layout()
plt.show()
C:\Users\koji.nishimura\AppData\Local\Temp\ipykernel_21308\187639744.py:10: FutureWarning: Passing a dict as an indexer is deprecated and will raise in a future version. Use a list instead. students_df_afterstar = students_df[afterstar_variables].isnull().sum()
after_star_num = {'STAR_LATEST_MOTIVR': '最新のMOTIVR',
'STAR_LATEST_SELFCO': '最新のSELFCO',
'HSSATTOT': '高校時のSATのスコア',
'HSACTCOM': '高校時のACTのスコア',
'HSACTCON': '高校時でのSAT_ACT統合スコア'}
after_star_cat = {'HSGRDADD': '高校を卒業したかどうか(3値以上)',
'HSGRDCOL': '高校を卒業したかどうか(2値変数)'}
for i in after_star_num.keys():
students_df[i].plot(kind='hist', bins=20, title=after_star_num[i])
plt.show()
for i in after_star_cat.keys():
students_df[i].value_counts().sort_index().plot(kind="bar", title=after_star_cat[i])
plt.show()
# STAR期間中転校フラグを作成
def school_transfer_flag(x):
if (np.isnan(x['GKSCHID'])!=True)&(np.isnan(x['G1SCHID'])!=True)&(x['GKSCHID']!=x['G1SCHID']):
return 1
if (np.isnan(x['GKSCHID'])!=True)&(np.isnan(x['G2SCHID'])!=True)&(x['GKSCHID']!=x['G2SCHID']):
return 1
if (np.isnan(x['GKSCHID'])!=True)&(np.isnan(x['G3SCHID'])!=True)&(x['GKSCHID']!=x['G3SCHID']):
return 1
if (np.isnan(x['G1SCHID'])!=True)&(np.isnan(x['G2SCHID'])!=True)&(x['G1SCHID']!=x['G2SCHID']):
return 1
if (np.isnan(x['G1SCHID'])!=True)&(np.isnan(x['G3SCHID'])!=True)&(x['G1SCHID']!=x['G3SCHID']):
return 1
if (np.isnan(x['G2SCHID'])!=True)&(np.isnan(x['G3SCHID'])!=True)&(x['G2SCHID']!=x['G3SCHID']):
return 1
else:
return 0
students_df['SCH_TRANSF_FLAG'] = students_df.apply(lambda x:school_transfer_flag(x),axis=1)
# STAR期間中に転校があった生徒情報
print('STAR期間中に転校したことがある生徒の比率: {percentage} %'.format(percentage=round(students_df['SCH_TRANSF_FLAG'].sum() / len(students_df) * 100,2)))
STAR期間中に転校したことがある生徒の比率: 3.84 %
# SAT期間(4年間)のうち、同等の情報の中で最新のものを取得する関数を定義
def get_latest(x, columns):
col_names = [s for s in students_df.columns if columns in s]
try:
G3_col = [s for s in col_names if 'G3' in s][0]
G2_col = [s for s in col_names if 'G2' in s][0]
G1_col = [s for s in col_names if 'G1' in s][0]
GK_col = [s for s in col_names if 'GK' in s][0]
if np.isnan(x[G3_col])==False:
return x[G3_col]
elif np.isnan(x[G2_col])==False:
return x[G2_col]
elif np.isnan(x[G1_col])==False:
return x[G1_col]
elif np.isnan(x[GK_col])==False:
return x[GK_col]
except:
return 'invalid'
# SAT期間(4年間)のうち、同等の情報の中で最古のものを取得する関数を定義
def get_oldest(x, columns):
col_names = [s for s in students_df.columns if columns in s]
try:
G3_col = [s for s in col_names if 'G3' in s][0]
G2_col = [s for s in col_names if 'G2' in s][0]
G1_col = [s for s in col_names if 'G1' in s][0]
GK_col = [s for s in col_names if 'GK' in s][0]
if np.isnan(x[GK_col])==False:
return x[GK_col]
elif np.isnan(x[G1_col])==False:
return x[G1_col]
elif np.isnan(x[G2_col])==False:
return x[G2_col]
elif np.isnan(x[G3_col])==False:
return x[G3_col]
except:
return 'invalid'
# 対象の各項目について、「STAR期間中に取得できる最新の列情報」を取得する
# G3のデータが取得できればG3, G2のデータが取得できればG2…
latest_list = ['TYEARS', 'SURBAN', 'THIGHD', 'TGEN', 'TRACE', 'TCAREE', 'TYEARS','TGEN_MATCH', 'TRACE_MATCH', 'FREELU']
for i in latest_list:
latest_col_name = 'LATEST_' + i
students_df[latest_col_name] = students_df.apply(lambda x:get_latest(x, i), axis=1)
# 作成した列について、念のため'invalid'が挿入されていないか確認
students_df[map(lambda val: 'LATEST_' + val, latest_list)].head(5)
| LATEST_TYEARS | LATEST_SURBAN | LATEST_THIGHD | LATEST_TGEN | LATEST_TRACE | LATEST_TCAREE | LATEST_TYEARS | LATEST_TGEN_MATCH | LATEST_TRACE_MATCH | LATEST_FREELU | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 15.0 | 3.0 | 3.0 | 2.0 | 1.0 | 4.0 | 15.0 | 0.0 | 1.0 | 2.0 |
| 1 | 5.0 | 2.0 | 2.0 | 2.0 | 2.0 | 4.0 | 5.0 | 0.0 | 0.0 | 2.0 |
| 2 | 17.0 | 2.0 | 2.0 | 2.0 | 2.0 | 6.0 | 17.0 | 1.0 | 1.0 | NaN |
| 3 | 15.0 | 3.0 | 2.0 | 2.0 | 1.0 | 4.0 | 15.0 | 0.0 | 1.0 | 2.0 |
| 4 | 25.0 | 1.0 | 3.0 | 2.0 | 1.0 | 1.0 | 25.0 | 1.0 | 0.0 | 1.0 |
例えば、幼稚園~2年生まで小規模クラスに割り当てられたが、3年生は通常クラスに割り当てられた、という場合は、過去の小規模クラスへの処置効果が残っていると想定できる。
上記を判定するための変数を作成。
# STARプロジェクト以降、「過去にSTARプロジェクト上何回少人数クラスに割り当てられたか」を表す変数を作成
students_df['G1_YEARSSMA_FORMER'] = students_df['GKCLASST'].apply(lambda x : 1 if x == 1 else 0)
students_df['G2_YEARSSMA_FORMER'] = students_df['GKCLASST'].apply(lambda x : 1 if x == 1 else 0) + \
students_df['G1CLASST'].apply(lambda x : 1 if x == 1 else 0)
students_df['G3_YEARSSMA_FORMER'] = students_df['GKCLASST'].apply(lambda x : 1 if x == 1 else 0) + \
students_df['G1CLASST'].apply(lambda x : 1 if x == 1 else 0) + \
students_df['G2CLASST'].apply(lambda x : 1 if x == 1 else 0)
# TODO: べた書き部分の修正
# 目的変数
EACHYEAR_TARGET_VARIABLES = ['SELFCO', # 心理尺度「自己概念」のスコア
'MOTIVR', # 心理尺度「モチベーション」のスコア
'TREADS', # SAT読解能力のスコア
'TMATHS', # SAT数学能力のスコア
'TLISTS'] # SATリスニング能力のスコア
# 説明変数
# カテゴリ変数、学年共通
EACHYEAR_COMMON_CATEGORY_EXPLANATORY_VARIABLES = ['GENDER', # 性別
'RACE', # 人種
'ENTRANCE_GRADE', # 入学年度
'FLAGGK'] # 幼稚園のデータを利用可能かどうか
# 数値変数、学年興津
EACHYEAR_COMMON_NUMERIC_EXPLANATORY_VARIABLES = ['DATEDIFF_FROM_ENTRANCE_DATE'] # 誕生日と入学日との差分日数
# カテゴリ変数、学年ごと
EACHYEAR_NOTCOMMON_CATEGORY_EXPLANATORY_VARIABLES = ['SURBAN', # 学校の都会度
'FREELU', # フリーランチを希望していたか
'TGEN', # 教師の性別
'TRACE', # 教師の人種
'THIGHD', # 教師の最終学歴
'TCAREE', # 教師のキャリア志望
'TRACE_MATCH', # 教師の人種が、生徒の人種と一致していたか
'TGEN_MATCH'] # 教師の性別が、生徒の性別と一致していたか
# 数値変数、学年ごと
EACHYEAR_NOTCOMMON_NUMERIC_EXPLANATORY_VARIABLES = ['ENRMNT', # 学校における在籍者数
'TYEARS'] # 教師の経験年数
# カテゴリ変数、3値以上、学年共通
EACHYEAR_COMMON_CATEGORY_OVER3_EXPLANATORY_VARIABLES = ['RACE', # 人種
'ENTRANCE_GRADE'] # 入学年度
# カテゴリ変数、3値以上、学年共通
EACHYEAR_NOTCOMMON_CATEGORY_OVER3_EXPLANATORY_VARIABLES = ['SURBAN', # 学校の都会度
'TGEN', # 教師の性別
'TRACE', # 教師の人種
'THIGHD', # 教師の最終学歴
'TCAREE']
STAR_GRADE = ['GK', 'G1', 'G2', 'G3']
RCT_COMPARISON_EACH_YEAR = {}
# 上記の情報をもとに、学年別の目的変数、説明変数(カテゴリ変数、数値変数を分割)を定義
for i in STAR_GRADE:
RCT_COMPARISON_EACH_YEAR[i]={}
RCT_COMPARISON_EACH_YEAR[i]['STAR_FLAG'] = [s for s in students_df if (i in s)&('FLAGS' in s)][0]
RCT_COMPARISON_EACH_YEAR[i]['CLASS_FLAG'] = [s for s in students_df if (i in s)&('CLASST' in s)][0]
tmp_list = []
for t in EACHYEAR_TARGET_VARIABLES:
tmp_list.append([s for s in students_df if (i in s)&(t in s)][0])
RCT_COMPARISON_EACH_YEAR[i]['TARGET_COLUMNS'] = tmp_list
tmp_list_2 = []
for category in EACHYEAR_NOTCOMMON_CATEGORY_EXPLANATORY_VARIABLES:
tmp_list_2.append([s for s in students_df if (i in s)&(category in s)][0])
RCT_COMPARISON_EACH_YEAR[i]['CATEGORY_EXP_VARIABLES'] = list(itertools.chain.from_iterable([EACHYEAR_COMMON_CATEGORY_EXPLANATORY_VARIABLES,
tmp_list_2]))
if i=='GK':
RCT_COMPARISON_EACH_YEAR[i]['CATEGORY_EXP_VARIABLES'].remove('FLAGGK')
tmp_list_3 = []
for numeric in EACHYEAR_NOTCOMMON_NUMERIC_EXPLANATORY_VARIABLES:
tmp_list_3.append([s for s in students_df if (i in s)&(numeric in s)][0])
if i=='G1':
tmp_list_3.append('G1_YEARSSMA_FORMER')
elif i=='G2':
tmp_list_3.append('G2_YEARSSMA_FORMER')
elif i=='G3':
tmp_list_3.append('G3_YEARSSMA_FORMER')
RCT_COMPARISON_EACH_YEAR[i]['NUMERIC_EXP_VARIABLES'] = list(itertools.chain.from_iterable([EACHYEAR_COMMON_NUMERIC_EXPLANATORY_VARIABLES,
tmp_list_3]))
tmp_list_4 = []
for category in EACHYEAR_NOTCOMMON_CATEGORY_OVER3_EXPLANATORY_VARIABLES:
tmp_list_4.append([s for s in students_df if (i in s)&(category in s)][0])
RCT_COMPARISON_EACH_YEAR[i]['OVER3_CATEGORY_EXP_VARIABLES'] = list(itertools.chain.from_iterable([EACHYEAR_COMMON_CATEGORY_OVER3_EXPLANATORY_VARIABLES,
tmp_list_4]))
print(RCT_COMPARISON_EACH_YEAR)
{'GK': {'STAR_FLAG': 'FLAGSGK', 'CLASS_FLAG': 'GKCLASST', 'TARGET_COLUMNS': ['GKSELFCO', 'GKMOTIVR', 'GKTREADS', 'GKTMATHS', 'GKTLISTS'], 'CATEGORY_EXP_VARIABLES': ['GENDER', 'RACE', 'ENTRANCE_GRADE', 'GKSURBAN', 'GKFREELU', 'GKTGEN', 'GKTRACE', 'GKTHIGHD', 'GKTCAREE', 'GKTRACE_MATCH', 'GKTGEN_MATCH'], 'NUMERIC_EXP_VARIABLES': ['DATEDIFF_FROM_ENTRANCE_DATE', 'GKENRMNT', 'GKTYEARS'], 'OVER3_CATEGORY_EXP_VARIABLES': ['RACE', 'ENTRANCE_GRADE', 'GKSURBAN', 'GKTGEN', 'GKTRACE', 'GKTHIGHD', 'GKTCAREE']}, 'G1': {'STAR_FLAG': 'FLAGSG1', 'CLASS_FLAG': 'G1CLASST', 'TARGET_COLUMNS': ['G1SELFCO', 'G1MOTIVR', 'G1TREADS', 'G1TMATHS', 'G1TLISTS'], 'CATEGORY_EXP_VARIABLES': ['GENDER', 'RACE', 'ENTRANCE_GRADE', 'FLAGGK', 'G1SURBAN', 'G1FREELU', 'G1TGEN', 'G1TRACE', 'G1THIGHD', 'G1TCAREE', 'G1TRACE_MATCH', 'G1TGEN_MATCH'], 'NUMERIC_EXP_VARIABLES': ['DATEDIFF_FROM_ENTRANCE_DATE', 'G1ENRMNT', 'G1TYEARS', 'G1_YEARSSMA_FORMER'], 'OVER3_CATEGORY_EXP_VARIABLES': ['RACE', 'ENTRANCE_GRADE', 'G1SURBAN', 'G1TGEN', 'G1TRACE', 'G1THIGHD', 'G1TCAREE']}, 'G2': {'STAR_FLAG': 'FLAGSG2', 'CLASS_FLAG': 'G2CLASST', 'TARGET_COLUMNS': ['G2SELFCO', 'G2MOTIVR', 'G2TREADS', 'G2TMATHS', 'G2TLISTS'], 'CATEGORY_EXP_VARIABLES': ['GENDER', 'RACE', 'ENTRANCE_GRADE', 'FLAGGK', 'G2SURBAN', 'G2FREELU', 'G2TGEN', 'G2TRACE', 'G2THIGHD', 'G2TCAREE', 'G2TRACE_MATCH', 'G2TGEN_MATCH'], 'NUMERIC_EXP_VARIABLES': ['DATEDIFF_FROM_ENTRANCE_DATE', 'G2ENRMNT', 'G2TYEARS', 'G2_YEARSSMA_FORMER'], 'OVER3_CATEGORY_EXP_VARIABLES': ['RACE', 'ENTRANCE_GRADE', 'G2SURBAN', 'G2TGEN', 'G2TRACE', 'G2THIGHD', 'G2TCAREE']}, 'G3': {'STAR_FLAG': 'FLAGSG3', 'CLASS_FLAG': 'G3CLASST', 'TARGET_COLUMNS': ['G3SELFCO', 'G3MOTIVR', 'G3TREADS', 'G3TMATHS', 'G3TLISTS'], 'CATEGORY_EXP_VARIABLES': ['GENDER', 'RACE', 'ENTRANCE_GRADE', 'FLAGGK', 'G3SURBAN', 'G3FREELU', 'G3TGEN', 'G3TRACE', 'G3THIGHD', 'G3TCAREE', 'G3TRACE_MATCH', 'G3TGEN_MATCH'], 'NUMERIC_EXP_VARIABLES': ['DATEDIFF_FROM_ENTRANCE_DATE', 'G3ENRMNT', 'G3TYEARS', 'G3_YEARSSMA_FORMER'], 'OVER3_CATEGORY_EXP_VARIABLES': ['RACE', 'ENTRANCE_GRADE', 'G3SURBAN', 'G3TGEN', 'G3TRACE', 'G3THIGHD', 'G3TCAREE']}}
# STARプロジェクト: 全期間の説明変数、目的変数等の辞書を定義
RCT_COMPARISON_ALL_YEAR = {'STAR_DURATION': 'YEARSSTA', # STARプロジェクトに何年参加していたかを表す(1~4)
'STAR_SMALL': 'YEARSSMA', # SMALLクラスに何年所属していたかを表す(0~4)
'STAR_CLASS_TYPE': 'CMPSTYPE', # STAR参加期間(最長4年分)のクラスを所定のロジックに基づいて判定
'TARGET_COLUMNS': ['STAR_LATEST_SELFCO', # STARプロジェクト期間内における最新の心理尺度「自己概念」のスコア
'STAR_LATEST_MOTIVR', # STARプロジェクト期間内における最新の心理尺度「自己概念」のスコア
'HSSATCON'],
'NUMERIC_EXP_VARIABLES': ['DATEDIFF_FROM_ENTRANCE_DATE', # 誕生日と入学日との差分日数
'LATEST_ENRLMNT', # STARプロジェクト期間内における最新の在籍年度の在籍者数
'LATEST_TYEARS'], # STARプロジェクト期間内における最新の在籍年度の教師の経験年数
'CATEGORY_EXP_VARIABLES': ['GENDER', # 性別
'RACE', # 人種
'FLAGGK', # 幼稚園のデータを利用可能かどうか
'ENTRANCE_GRADE', # 入学年度
'LATEST_SURBAN', # STARプロジェクト期間内における最新の在籍年度の学校の都会度
'LATEST_FREELU', # STARプロジェクト期間内における最新の在籍年度にフリーランチを希望していたか
'LATEST_TGEN', # STARプロジェクト期間内における最新の在籍年度の教師の性別
'LATEST_TRACE', # STARプロジェクト期間内における最新の在籍年度の教師の人種
'LATEST_THIGHD', # STARプロジェクト期間内における最新の在籍年度の教師の最終学歴
'LATEST_TRACE_MATCH', # STARプロジェクト期間内における最新の在籍年度の教師の人種が、生徒の人種と一致していたか
'LATEST_TGEN_MATCH', # STARプロジェクト期間内における最新の在籍年度の教師の人種が、生徒の人種と一致していたか
'LATEST_TCAREE'] # STARプロジェクト期間内における最新の在籍年度の教師のキャリア志望
}
# 各種目的変数群で比較
# 欠損値は一旦考慮しない
for key in RCT_COMPARISON_EACH_YEAR.keys():
# STAR参加学年ごとに必要な変数情報を取得する
TARGET_COLUMNS_LIST = RCT_COMPARISON_EACH_YEAR[key]['TARGET_COLUMNS']
STAR_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['STAR_FLAG']
CLASS_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['CLASS_FLAG']
CATEGORY_VARIABLES = RCT_COMPARISON_EACH_YEAR[key]['CATEGORY_EXP_VARIABLES']
NUMERIC_EXP_VARIABLES = RCT_COMPARISON_EACH_YEAR[key]['NUMERIC_EXP_VARIABLES']
# 各学年ごとに、該当年度の参加フラグがある生徒について、小規模クラス、通常クラスのデータに分割する
small_df = students_df[(students_df[STAR_FLAG_NAME]==1)&(students_df[CLASS_FLAG_NAME]==1)]
normal_df = students_df[(students_df[STAR_FLAG_NAME]==1)&(students_df[CLASS_FLAG_NAME].isin([2, 3]))]
# pandas.DataFrame.plotサブプロットレイアウト準備
plot_x = 4
num_variables = len(CATEGORY_VARIABLES) + len(NUMERIC_EXP_VARIABLES)
plot_y = num_variables // plot_x if num_variables % plot_x == 0 else num_variables // plot_x + 1
fig, axes = plt.subplots(facecolor="white", nrows=plot_y, ncols=plot_x, figsize=(20, 18))
plot_position = 0
# カテゴリ変数について処置群、対象群の分布を可視化する
for CATEGORY in CATEGORY_VARIABLES:
category_uniques = np.sort(students_df[CATEGORY].unique())
small_cat_rate = small_df[CATEGORY].value_counts().map(lambda x: x / small_df[CATEGORY].count())
normal_cat_rate = normal_df[CATEGORY].value_counts().map(lambda x: x / normal_df[CATEGORY].count())
new_df = pd.DataFrame(index=category_uniques[~np.isnan(category_uniques)])
new_df = new_df.merge(small_cat_rate, left_index=True, right_index=True).merge(normal_cat_rate,left_index=True, right_index=True)
new_df.columns=['SMALL','NORMAL']
new_df.T.plot(ax=axes[divmod(plot_position, plot_x)[0], divmod(plot_position, plot_x)[1]],
kind='bar', stacked=True, title=CATEGORY)
plot_position += 1
# 数値変数について処置群、対象群の分布を可視化する
for NUMERIC in NUMERIC_EXP_VARIABLES:
small_df.rename(columns={NUMERIC: 'SMALL'})['SMALL'].plot(kind='kde',
legend=True,
ax=axes[divmod(plot_position, plot_x)[0],
divmod(plot_position, plot_x)[1]],
title=NUMERIC)
normal_df.rename(columns={NUMERIC: 'NORMAL'})['NORMAL'].plot(kind='kde',
legend=True,
ax=axes[divmod(plot_position, plot_x)[0],
divmod(plot_position, plot_x)[1]])
plot_position += 1
plt.tight_layout()
plt.show()
# 各種目的変数群で比較
# 欠損値は一旦除外
for key in RCT_COMPARISON_EACH_YEAR.keys():
# STAR参加学年ごとに必要な変数情報を取得する
TARGET_COLUMNS_LIST = RCT_COMPARISON_EACH_YEAR[key]['TARGET_COLUMNS']
STAR_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['STAR_FLAG']
CLASS_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['CLASS_FLAG']
CATEGORY_VARIABLES = RCT_COMPARISON_EACH_YEAR[key]['CATEGORY_EXP_VARIABLES']
NUMERIC_EXP_VARIABLES = RCT_COMPARISON_EACH_YEAR[key]['NUMERIC_EXP_VARIABLES']
# 各学年ごとに、該当年度の参加フラグがある生徒について、小規模クラス、通常クラスのデータに分割する
small_df = students_df[(students_df[STAR_FLAG_NAME]==1)&(students_df[CLASS_FLAG_NAME]==1)]
normal_df = students_df[(students_df[STAR_FLAG_NAME]==1)&(students_df[CLASS_FLAG_NAME].isin([2, 3]))]
# pandas.DataFrame.plotサブプロットレイアウト準備
plot_x = 3
num_variables = len(TARGET_COLUMNS_LIST)
plot_y = num_variables // plot_x if num_variables % plot_x == 0 else num_variables // plot_x + 1
fig, axes = plt.subplots(facecolor="white", nrows=plot_y, ncols=plot_x, figsize=(20, 8))
plot_position = 0
# 目的変数について処置群、対象群の分布を可視化する
for TARGET in TARGET_COLUMNS_LIST:
small_df.rename(columns={TARGET: 'SMALL'})['SMALL'].plot(kind='hist',
density=True,
legend=True,
alpha=0.3,
bins=30,
ax=axes[divmod(plot_position, plot_x)[0],
divmod(plot_position, plot_x)[1]],
title=NUMERIC)
normal_df.rename(columns={TARGET: 'NORMAL'})['NORMAL'].plot(kind='hist',
density=True,
legend=True,
alpha=0.3,
bins=30,
ax=axes[divmod(plot_position, plot_x)[0],
divmod(plot_position, plot_x)[1]])
axes[divmod(plot_position, plot_x)[0], divmod(plot_position, plot_x)[1]].set_title(TARGET)
plot_position += 1
plt.tight_layout()
plt.show()
# 各列情報の日本語化辞書の定義
race_dict = {1: '白人', 2: '黒人', 3: 'アジア人',
4: 'ヒスパニック', 5: 'ネイティブアメリカン', 6: 'その他', np.nan: '不明'}
gender_dict = {1: '男性', 2:'女性', np.nan:'不明'}
yes_no_dict = {1: 'はい', 2:'いいえ', np.nan:'不明'}
entrance_grade_dict = {0: '飛び級', 1:'通常', 2:'Redshirting', np.nan: '不明'}
surban_dict = {1: '都会', 2:'郊外', 3:'田舎', 4:'都心近接低所得地域', np.nan: '不明'}
career_dict = {1:'教師ルートを望まない', 2:'Apprentice(見習い)', 3:'Probation(実習生)', 4:'レベル1',
5:'レベル2', 6:'レベル3', 7:'ペンディング', np.nan:'不明'}
highest_degree_dict = {1:'準学士', 2:'学士', 3:'修士', 4:'修士+',
5:'専門職学位', 6:'博士', np.nan:'不明'}
yes_no_dict_2 = {1: 'はい', 0:'いいえ', np.nan:'不明'}
# 設定
fig, axes = plt.subplots(facecolor="white", nrows=4, ncols=4, figsize=(20, 20))
students_df.RACE.replace(race_dict).value_counts().reindex(index=race_dict.values()).plot(ax=axes[0, 0],
kind="bar",
title='学生の人種構成',
fontsize=12)
axes[0, 0].tick_params(axis='x', labelrotation=45)
students_df.GENDER.replace(gender_dict).value_counts().reindex(index=gender_dict.values()).plot(ax=axes[0, 1],
kind="bar",
title='学生の性別構成')
students_df.GKFREELU.replace(yes_no_dict).value_counts().reindex(index=yes_no_dict.values()).plot(ax=axes[0, 2],
kind="bar",
title='幼稚園でフリーランチを希望したかどうか')
students_df.ENTRANCE_GRADE.replace(entrance_grade_dict).value_counts().reindex(index=entrance_grade_dict.values()).plot(ax=axes[0, 3],
kind="bar",
title='入学年度')
axes[0, 3].tick_params(axis='x', labelrotation=45)
students_df.DATEDIFF_FROM_ENTRANCE_DATE.plot(kind='hist', ax=axes[1, 0],
title='入学日と誕生日の日数差分', bins=30)
students_df.GKENRMNT.plot(kind='hist', ax=axes[1, 1],
title='所属幼稚園の生徒数', bins=30)
students_df.GKSURBAN.replace(surban_dict).value_counts().reindex(index=surban_dict.values()).plot(ax=axes[1, 2],
kind="bar",
title='幼稚園の都会度')
students_df.GKTRACE.replace(race_dict).value_counts().reindex(index=race_dict.values()).plot(ax=axes[1, 3],
kind="bar",
title='教師の人種構成')
axes[1, 3].tick_params(axis='x', labelrotation=45)
students_df.GKTRACE_MATCH.replace(yes_no_dict_2).value_counts().reindex(index=yes_no_dict_2.values()).plot(ax=axes[2, 0],
kind="bar",
title='教師と生徒の人種が一致したか')
students_df.GKTGEN.replace(gender_dict).value_counts().reindex(index=gender_dict.values()).plot(ax=axes[2, 1],
kind="bar",
title='教師の性別構成')
students_df.GKTGEN_MATCH.replace(yes_no_dict_2).value_counts().reindex(index=yes_no_dict_2.values()).plot(ax=axes[2, 2],
kind="bar",
title='教師と生徒の性別が一致したか')
students_df.GKTCAREE.replace(career_dict).value_counts().reindex(index=career_dict.values()).plot(ax=axes[2, 3],
kind="bar",
title='教師のキャリア志望',
fontsize=12)
axes[2, 3].tick_params(axis='x', labelrotation=45)
students_df.GKTHIGHD.replace(highest_degree_dict).value_counts().reindex(index=highest_degree_dict.values()).plot(ax=axes[3, 0],
kind="bar",
title='教師の最終学歴',
fontsize=12)
axes[3, 0].tick_params(axis='x', labelrotation=45)
students_df.GKTYEARS.plot(kind='hist', ax=axes[3, 1],
title='教師の経験年数', bins=20)
plt.tight_layout()
plt.show()
# 各年次ごとの説明変数・目的変数間の相関係数(スピアマン)行列を出力
colormap = plt.cm.RdBu
for year in RCT_COMPARISON_EACH_YEAR:
tmp_columns = list(itertools.chain.from_iterable([RCT_COMPARISON_EACH_YEAR[year]['TARGET_COLUMNS'],
RCT_COMPARISON_EACH_YEAR[year]['CATEGORY_EXP_VARIABLES'],
RCT_COMPARISON_EACH_YEAR[year]['NUMERIC_EXP_VARIABLES']]))
tmp_df = students_df[students_df[RCT_COMPARISON_EACH_YEAR[year]['STAR_FLAG']]==1]
plt.figure(figsize=(14,12))
plt.title(year)
sns.heatmap(tmp_df[tmp_columns].astype(float).corr(method='spearman'),linewidths=0.1,vmax=1.0,
square=True, cmap=colormap, linecolor='white', annot=True)
plt.tight_layout()
plt.show()
# 相関比(カテゴリ変数と数値変数間の相関)を算出するための関数を定義
def correlation_ratio(df, category_series_column, numerical_series_column):
# 全変動を計算
total_mean = df[numerical_series_column].mean()
total_variation = sum(df[numerical_series_column].map(lambda x: (x - total_mean)**2))
# 級間変動を計算
subgroup_variation = 0
for unique in df[category_series_column].unique():
tmp_unique_df = df[df[category_series_column]==unique]
subgroup_count = tmp_unique_df[numerical_series_column].count()
subgroup_mean = tmp_unique_df[numerical_series_column].mean()
subgroup_variation += subgroup_count*((total_mean - subgroup_mean)**2)
# 相関比を計算
correlation_ratio = subgroup_variation / total_variation
return correlation_ratio
colormap = plt.cm.RdBu
# 3値以上のカテゴリ値と目的変数間の相関比行列を表示
for year in RCT_COMPARISON_EACH_YEAR:
target_columns = RCT_COMPARISON_EACH_YEAR[year]['TARGET_COLUMNS']
exp_columns = RCT_COMPARISON_EACH_YEAR[year]['OVER3_CATEGORY_EXP_VARIABLES']
matrix_colrows = list(itertools.chain.from_iterable([target_columns, exp_columns]))
cor_df = pd.DataFrame(index=matrix_colrows, columns=matrix_colrows)
for i, j in itertools.product(exp_columns, target_columns):
tmp_df = students_df[~(students_df[i].isnull())&
~(students_df[j].isnull())&
(students_df[RCT_COMPARISON_EACH_YEAR[year]['STAR_FLAG']]==1)]
cor = correlation_ratio(tmp_df, i, j)
cor_df[i][j] = cor
cor_df[j][i] = cor
plt.figure(figsize=(14,12))
plt.title(year)
sns.heatmap(cor_df.fillna(np.nan),linewidths=0.1,vmax=1.0,
square=True, cmap=colormap, linecolor='white', annot=True)
plt.show()
# 3値以上のカテゴリ値と目的変数間のバイオリンプロット行列を表示
for year in RCT_COMPARISON_EACH_YEAR:
target_columns = RCT_COMPARISON_EACH_YEAR[year]['TARGET_COLUMNS']
exp_columns = RCT_COMPARISON_EACH_YEAR[year]['OVER3_CATEGORY_EXP_VARIABLES']
for i in exp_columns:
# 5列のプロット
num_variables = len(target_columns)
plot_x = 3
plot_y = num_variables // plot_x if num_variables % plot_x == 0 else num_variables // plot_x + 1
fig, axes = plt.subplots(facecolor="white", nrows=plot_y, ncols=plot_x, figsize=(10, 6))
fig.suptitle(year+'時の目的変数とカテゴリ変数' + i + 'の分布', x=0.5, y=0.96)
plot_position = 0
for j in target_columns:
tmp_df = students_df[~(students_df[i].isnull())&
~(students_df[j].isnull())&
(students_df[RCT_COMPARISON_EACH_YEAR[year]['STAR_FLAG']]==1)]
sns.violinplot(x=i, y=j, data=tmp_df, ax=axes[divmod(plot_position, plot_x)[0], divmod(plot_position, plot_x)[1]])
plot_position += 1
plt.tight_layout()
plt.show()
import copy
colormap = plt.cm.RdBu
for year in RCT_COMPARISON_EACH_YEAR:
target_columns = copy.copy(RCT_COMPARISON_EACH_YEAR[year]['TARGET_COLUMNS'])
common_exp_list = ['GENDER', 'RACE', 'ENTRANCE_GRADE']
grade_exp_tmp_list = ['FREELU', 'SURBAN', 'THIGHD', 'TCAREE']
grade_exp_list = []
for i in grade_exp_tmp_list:
grade_exp_list.append([s for s in RCT_COMPARISON_EACH_YEAR[year]['CATEGORY_EXP_VARIABLES'] if i in s][0])
exp_list = list(itertools.chain.from_iterable([common_exp_list, grade_exp_list]))
tmp_df = students_df[students_df[RCT_COMPARISON_EACH_YEAR[year]['STAR_FLAG']]==1].sample(n=200, random_state=123)
for j in exp_list:
tmp_columns = copy.copy(target_columns)
tmp_columns.append(j)
sns.pairplot(data=tmp_df[tmp_columns], hue=j, palette='bright')
plt.show()
# STAR年次別: 目的変数の分布とProbplotを表示
# Probplotが直線上に出力されていれば正規性が強いと想定
for key in RCT_COMPARISON_EACH_YEAR.keys():
COLUMNS_LIST = list(itertools.chain.from_iterable([RCT_COMPARISON_EACH_YEAR[key]['TARGET_COLUMNS'],
RCT_COMPARISON_EACH_YEAR[key]['NUMERIC_EXP_VARIABLES']]))
# サブプロット準備
plot_x = 6
num_variables = len(COLUMNS_LIST) * 2
plot_y = num_variables // plot_x if num_variables % plot_x == 0 else num_variables // plot_x + 1
fig, axes = plt.subplots(facecolor="white", nrows=plot_y, ncols=plot_x, figsize=(24, 16))
plot_position = 0
for target in COLUMNS_LIST:
STAR_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['STAR_FLAG']
CLASS_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['CLASS_FLAG']
students_df[(students_df[STAR_FLAG_NAME]==1)&
(students_df[CLASS_FLAG_NAME]==1)][target].plot(kind='hist', bins=30, legend=True, ax=axes[divmod(plot_position, plot_x)[0], divmod(plot_position, plot_x)[1]])
plot_position += 1
scipy.stats.probplot(students_df[(students_df[STAR_FLAG_NAME]==1)&(students_df[CLASS_FLAG_NAME]==1)][target], plot=axes[divmod(plot_position, plot_x)[0], divmod(plot_position, plot_x)[1]],)
axes[divmod(plot_position, plot_x)[0], divmod(plot_position, plot_x)[1]].set_title("Probplot for " + target)
plot_position += 1
plt.tight_layout()
plt.show()
# STAR全年分: 目的変数の分布とProbplotを表示
# Probplotが直線上に出力されていれば正規性が強いと想定
COLUMNS_LIST = list(itertools.chain.from_iterable([RCT_COMPARISON_ALL_YEAR['TARGET_COLUMNS'],
RCT_COMPARISON_ALL_YEAR['NUMERIC_EXP_VARIABLES']]))
CLASS_FLAG_NAME = RCT_COMPARISON_ALL_YEAR['STAR_CLASS_TYPE']
# サブプロット準備
plot_x = 6
num_variables = len(COLUMNS_LIST * 2)
plot_y = num_variables // plot_x if num_variables % plot_x == 0 else num_variables // plot_x + 1
fig, axes = plt.subplots(facecolor="white", nrows=plot_y, ncols=plot_x, figsize=(24, 12))
plot_position = 0
for target in COLUMNS_LIST:
students_df[(students_df[CLASS_FLAG_NAME]==1)][target].plot(ax=axes[divmod(plot_position, plot_x)[0], divmod(plot_position, plot_x)[1]],
kind='hist', bins=30, legend=True)
plot_position += 1
scipy.stats.probplot(students_df[(students_df[CLASS_FLAG_NAME]==1)][target], plot=axes[divmod(plot_position, plot_x)[0], divmod(plot_position, plot_x)[1]])
axes[divmod(plot_position, plot_x)[0], divmod(plot_position, plot_x)[1]].set_title("Probplot for " + target)
plot_position += 1
plt.tight_layout()
plt.show()
概ねすべての数値変数で正規分布しているものはなさそうにみえる
# シャピロウィルク検定によって正規性を確認
shapiro_list = []
for key in RCT_COMPARISON_EACH_YEAR.keys():
TARGET_COLUMNS_LIST = list(itertools.chain.from_iterable([RCT_COMPARISON_EACH_YEAR[key]['TARGET_COLUMNS'],
RCT_COMPARISON_EACH_YEAR[key]['NUMERIC_EXP_VARIABLES']]))
STAR_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['STAR_FLAG']
CLASS_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['CLASS_FLAG']
tmp_df = students_df[students_df[STAR_FLAG_NAME]==1]
for target in TARGET_COLUMNS_LIST:
_, shapiro = scipy.stats.shapiro(tmp_df[~tmp_df[target].isnull()][target])
shapiro_list.append([target, shapiro])
shapiro_df = pd.DataFrame(shapiro_list, columns=['変数名', '正規性p値'])
c:\Users\koji.nishimura\venv1\.venv\lib\site-packages\scipy\stats\_morestats.py:1761: UserWarning: p-value may not be accurate for N > 5000.
warnings.warn("p-value may not be accurate for N > 5000.")
サンプルサイズが多すぎることでアラートが出ており
かつ多重検定の補正もしていないので参考程度に
shapiro_df[shapiro_df['正規性p値'] >= 0.05]
| 変数名 | 正規性p値 |
|---|
shapiro_df[shapiro_df['正規性p値'] >= 0.001]
| 変数名 | 正規性p値 |
|---|
p値が低い(正規性がある)ものは存在しなさそう
※なお、少人数の方が教育効果が高いという結論を検証するために、以降全て片側検定としている
# 各種目的変数群で比較
each_listwise_target_diff = []
for key in RCT_COMPARISON_EACH_YEAR.keys():
TARGET_COLUMNS_LIST = RCT_COMPARISON_EACH_YEAR[key]['TARGET_COLUMNS']
STAR_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['STAR_FLAG']
CLASS_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['CLASS_FLAG']
small_df = students_df[(students_df[STAR_FLAG_NAME]==1)&(students_df[CLASS_FLAG_NAME]==1)]
normal_df = students_df[(students_df[STAR_FLAG_NAME]==1)&(students_df[CLASS_FLAG_NAME].isin([2, 3]))]
for target in TARGET_COLUMNS_LIST:
# 効果量(平均値の差)
mean_diff = small_df[~(small_df[target].isnull())][target].mean() - normal_df[~(normal_df[target].isnull())][target].mean()
# cohenのd
small_len = len(small_df[~(small_df[target].isnull())][target]) - 1
normal_len = len(normal_df[~(normal_df[target].isnull())][target]) -1
small_var = np.var(small_df[~(small_df[target].isnull())][target])
normal_var = np.var(normal_df[~(normal_df[target].isnull())][target])
cohen_d = mean_diff / np.sqrt((small_len * small_var + normal_len * normal_var) / (small_len + normal_len))
# マンホイットニーのU検定
mannwhitneyu_test = scipy.stats.mannwhitneyu(small_df[~(small_df[target].isnull())][target],
normal_df[~(normal_df[target].isnull())][target],
alternative='greater')
each_listwise_target_diff.append([target, mean_diff, cohen_d, mannwhitneyu_test.pvalue,
small_df[~(small_df[target].isnull())][target].mean()])
each_listwise_target_df = pd.DataFrame(each_listwise_target_diff,
columns=['目的変数', '平均の差', 'Cohenのd', 'p値','小規模クラス平均値'])
each_listwise_target_df.head()
| 目的変数 | 平均の差 | Cohenのd | p値 | 小規模クラス平均値 | |
|---|---|---|---|---|---|
| 0 | GKSELFCO | 0.576933 | 0.111759 | 1.926606e-06 | 56.348329 |
| 1 | GKMOTIVR | 0.073794 | 0.029372 | 2.922247e-02 | 25.699871 |
| 2 | GKTREADS | 5.463244 | 0.172864 | 5.998701e-11 | 440.547441 |
| 3 | GKTMATHS | 7.935952 | 0.166880 | 1.985040e-08 | 490.931328 |
| 4 | GKTLISTS | 3.404809 | 0.102864 | 1.068373e-04 | 539.856817 |
# 2群が異なると判定(P値<0.05)され、かつ効果量が一定以上(Cohenのd>=0.2)以上ある目的変数
print(' 小規模クラスと大規模クラスの目的変数比較(欠損値はリストワイズ法で処理)')
each_listwise_target_df[(each_listwise_target_df['p値']<0.05)&(each_listwise_target_df['Cohenのd']>=0.2)]
小規模クラスと大規模クラスの目的変数比較(欠損値はリストワイズ法で処理)
| 目的変数 | 平均の差 | Cohenのd | p値 | 小規模クラス平均値 | |
|---|---|---|---|---|---|
| 7 | G1TREADS | 12.863027 | 0.234378 | 2.583858e-16 | 529.983544 |
| 8 | G1TMATHS | 11.355326 | 0.265307 | 5.752550e-21 | 538.670059 |
| 9 | G1TLISTS | 7.334839 | 0.218892 | 2.166503e-14 | 572.745690 |
| 12 | G2TREADS | 9.216221 | 0.201022 | 7.963208e-13 | 590.430323 |
| 17 | G3TREADS | 8.318068 | 0.216800 | 5.150025e-15 | 621.075718 |
※なお、少人数の方が教育効果が高いという結論を検証するために、以降全て片側検定としている
# 各種目的変数群で比較
from sklearn.experimental import enable_iterative_imputer # noqa
from sklearn.impute import IterativeImputer
import itertools
each_iterative_imputed_target_diff = []
for key in tqdm(RCT_COMPARISON_EACH_YEAR.keys()):
TARGET_COLUMNS_LIST = RCT_COMPARISON_EACH_YEAR[key]['TARGET_COLUMNS']
STAR_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['STAR_FLAG']
CLASS_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['CLASS_FLAG']
# 多重代入法に利用するカテゴリ変数をダミー変数化
# TODO: One-Hot-Encoderの実装変更
tmp_df = pd.get_dummies(students_df, columns=RCT_COMPARISON_EACH_YEAR[key]['CATEGORY_EXP_VARIABLES'])
tmp_df = tmp_df[tmp_df[STAR_FLAG_NAME]==1]
EXP_VARS = list(itertools.chain.from_iterable([RCT_COMPARISON_EACH_YEAR[key]['NUMERIC_EXP_VARIABLES'],
tmp_df[[s for s in tmp_df.columns if '.0' in s]]]))
for target in TARGET_COLUMNS_LIST:
tmp_list = list(itertools.chain.from_iterable([[target], EXP_VARS]))
# 上記の結果をもとに目的変数について多重代入法を実施する。
mi_list = IterativeImputer(max_iter=3).fit_transform(tmp_df[tmp_list])
mi_list = [x[0] for x in mi_list]
tmp_df[target] = mi_list
small_df = tmp_df[(tmp_df[STAR_FLAG_NAME]==1)&(tmp_df[CLASS_FLAG_NAME]==1)]
normal_df = tmp_df[(tmp_df[STAR_FLAG_NAME]==1)&(tmp_df[CLASS_FLAG_NAME].isin([2, 3]))]
# 効果量(平均値の差)
mean_diff = small_df[target].mean() - normal_df[target].mean()
# cohenのd
small_len = len(small_df[target]) - 1
normal_len = len(normal_df[target]) -1
small_var = np.var(small_df[target])
normal_var = np.var(normal_df[target])
cohen_d = mean_diff / np.sqrt((small_len * small_var + normal_len * normal_var) / (small_len + normal_len))
# マンホイットニーのU検定
mannwhitneyu_test = scipy.stats.mannwhitneyu(small_df[target],
normal_df[target],
alternative='greater')
each_iterative_imputed_target_diff.append([target, mean_diff, cohen_d, mannwhitneyu_test.pvalue,
small_df[~(small_df[target].isnull())][target].mean()])
each_iterative_imputed_target_df = pd.DataFrame(each_iterative_imputed_target_diff,
columns=['目的変数', '平均の差', 'Cohenのd', 'p値','小規模クラス平均値'])
0%| | 0/4 [00:00<?, ?it/s]
[IterativeImputer] Early stopping criterion not reached.
each_iterative_imputed_target_df.head()
| 目的変数 | 平均の差 | Cohenのd | p値 | 小規模クラス平均値 | |
|---|---|---|---|---|---|
| 0 | GKSELFCO | 0.467623 | 0.101471 | 4.105169e-07 | 56.277166 |
| 1 | GKMOTIVR | 0.060130 | 0.026814 | 1.040544e-02 | 25.689246 |
| 2 | GKTREADS | 5.021838 | 0.165522 | 6.014282e-11 | 440.223693 |
| 3 | GKTMATHS | 7.457092 | 0.162212 | 1.615182e-08 | 490.538451 |
| 4 | GKTLISTS | 3.282221 | 0.102621 | 5.831082e-05 | 539.702498 |
# 多重代入法で欠損値を埋めた後、2群が異なると判定(P値<0.05)され、かつ効果量が一定以上(Cohenのd>=0.2)以上ある目的変数
print(' 小規模クラスと大規模クラスの目的変数比較(欠損値は多重代入法で処理)')
each_iterative_imputed_target_df[(each_iterative_imputed_target_df['p値']<0.05)&(each_iterative_imputed_target_df['Cohenのd']>=0.2)]
小規模クラスと大規模クラスの目的変数比較(欠損値は多重代入法で処理)
| 目的変数 | 平均の差 | Cohenのd | p値 | 小規模クラス平均値 | |
|---|---|---|---|---|---|
| 7 | G1TREADS | 12.564653 | 0.236176 | 2.687729e-17 | 529.726177 |
| 8 | G1TMATHS | 11.146923 | 0.264760 | 1.587408e-21 | 538.478896 |
| 9 | G1TLISTS | 7.181223 | 0.218514 | 1.005026e-14 | 572.618536 |
| 12 | G2TREADS | 8.702871 | 0.200691 | 1.784230e-14 | 589.950785 |
| 17 | G3TREADS | 7.747531 | 0.214315 | 3.653447e-17 | 620.615174 |
※なお、少人数の方が教育効果が高いという結論を検証するために、以降全て片側検定としている
all_listwise_target_diff = []
TARGET_COLUMNS_LIST = RCT_COMPARISON_ALL_YEAR['TARGET_COLUMNS']
CLASS_FLAG_NAME = RCT_COMPARISON_ALL_YEAR['STAR_CLASS_TYPE']
small_df = students_df[(students_df[CLASS_FLAG_NAME]==1)]
normal_df = students_df[(students_df[CLASS_FLAG_NAME].isin([2, 3]))]
for target in TARGET_COLUMNS_LIST:
mean_diff = small_df[~(small_df[target].isnull())][target].mean() - normal_df[~(normal_df[target].isnull())][target].mean()
# cohenのd
small_len = len(small_df[~(small_df[target].isnull())][target]) - 1
normal_len = len(normal_df[~(normal_df[target].isnull())][target]) -1
small_var = np.var(small_df[~(small_df[target].isnull())][target])
normal_var = np.var(normal_df[~(normal_df[target].isnull())][target])
cohen_d = mean_diff / np.sqrt((small_len * small_var + normal_len * normal_var) / (small_len + normal_len))
mannwhitneyu_test = scipy.stats.mannwhitneyu(small_df[~(small_df[target].isnull())][target],
normal_df[~(normal_df[target].isnull())][target],
alternative='greater')
all_listwise_target_diff.append([target, mean_diff, cohen_d, mannwhitneyu_test.pvalue,
small_df[~(small_df[target].isnull())][target].mean()])
all_listwise_target_df = pd.DataFrame(all_listwise_target_diff,
columns=['目的変数', '平均の差', 'Cohenのd', 'p値','小規模クラス平均値'])
all_listwise_target_df
| 目的変数 | 平均の差 | Cohenのd | p値 | 小規模クラス平均値 | |
|---|---|---|---|---|---|
| 0 | STAR_LATEST_SELFCO | 0.133164 | 0.020834 | 0.353159 | 46.316078 |
| 1 | STAR_LATEST_MOTIVR | 0.024504 | 0.002710 | 0.505759 | 46.160782 |
| 2 | HSSATCON | 11.036458 | 0.057648 | 0.062980 | 906.814236 |
※なお、少人数の方が教育効果が高いという結論を検証するために、以降全て片側検定としている
# 各種目的変数群で比較
from sklearn.experimental import enable_iterative_imputer # noqa
from sklearn.impute import IterativeImputer
import itertools
all_multiple_target_diff = []
TARGET_COLUMNS_LIST = RCT_COMPARISON_ALL_YEAR['TARGET_COLUMNS']
CLASS_FLAG_NAME = RCT_COMPARISON_ALL_YEAR['STAR_CLASS_TYPE']
small_df = students_df[(students_df[CLASS_FLAG_NAME]==1)]
normal_df = students_df[(students_df[CLASS_FLAG_NAME].isin([2, 3]))]
TARGET_COLUMNS_LIST = RCT_COMPARISON_ALL_YEAR['TARGET_COLUMNS']
CLASS_FLAG_NAME = RCT_COMPARISON_ALL_YEAR['STAR_CLASS_TYPE']
# 多重代入法に利用するカテゴリ変数をダミー変数化
# TODO: One-Hot-Encoderの実装変更
tmp_df = pd.get_dummies(students_df, columns=RCT_COMPARISON_ALL_YEAR['CATEGORY_EXP_VARIABLES'], drop_first=True)
tmp_df = tmp_df[tmp_df[STAR_FLAG_NAME]==1]
EXP_VARS = list(itertools.chain.from_iterable([RCT_COMPARISON_ALL_YEAR['NUMERIC_EXP_VARIABLES'],
tmp_df[[s for s in tmp_df.columns if '.0' in s]]]))
for target in TARGET_COLUMNS_LIST:
tmp_list = list(itertools.chain.from_iterable([[target], EXP_VARS]))
# 上記の結果をもとに目的変数について多重代入法を実施する。
mi_list = IterativeImputer(max_iter=3).fit_transform(tmp_df[tmp_list])
mi_list = [x[0] for x in mi_list]
tmp_df[target] = mi_list
small_df = tmp_df[(tmp_df[STAR_FLAG_NAME]==1)&(tmp_df[CLASS_FLAG_NAME]==1)]
normal_df = tmp_df[(tmp_df[STAR_FLAG_NAME]==1)&(tmp_df[CLASS_FLAG_NAME].isin([2, 3]))]
# 効果量(平均値の差)
mean_diff = small_df[target].mean() - normal_df[target].mean()
# cohenのd
small_len = len(small_df[target]) - 1
normal_len = len(normal_df[target]) -1
small_var = np.var(small_df[target])
normal_var = np.var(normal_df[target])
cohen_d = mean_diff / np.sqrt((small_len * small_var + normal_len * normal_var) / (small_len + normal_len))
# マンホイットニーのU検定
mannwhitneyu_test = scipy.stats.mannwhitneyu(small_df[target],
normal_df[target],
alternative='greater')
all_multiple_target_diff.append([target, mean_diff, cohen_d, mannwhitneyu_test.pvalue,
small_df[~(small_df[target].isnull())][target].mean()])
all_multiple_target_df = pd.DataFrame(all_multiple_target_diff,
columns=['目的変数', '平均の差', 'Cohenのd', 'p値','小規模クラス平均値'])
# 欠損値処理法によるHSSATCONの比較
concat_df = pd.concat([all_multiple_target_df[all_multiple_target_df.目的変数=='HSSATCON'], all_listwise_target_df[all_listwise_target_df.目的変数=='HSSATCON']])
concat_df['欠損値処理'] = ['リストワイズ', '多重代入法']
concat_df
| 目的変数 | 平均の差 | Cohenのd | p値 | 小規模クラス平均値 | 欠損値処理 | |
|---|---|---|---|---|---|---|
| 2 | HSSATCON | 13.875406 | 0.094790 | 0.000224 | 869.043469 | リストワイズ |
| 2 | HSSATCON | 11.036458 | 0.057648 | 0.062980 | 906.814236 | 多重代入法 |
# 各種説明変数群(カテゴリ変数)で比較
listwise_side_exp_cat = []
for key in RCT_COMPARISON_EACH_YEAR.keys():
CAT_EXP_COLUMNS_LIST = RCT_COMPARISON_EACH_YEAR[key]['CATEGORY_EXP_VARIABLES']
STAR_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['STAR_FLAG']
CLASS_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['CLASS_FLAG']
small_df = students_df[(students_df[STAR_FLAG_NAME]==1)&(students_df[CLASS_FLAG_NAME]==1)]
normal_df = students_df[(students_df[STAR_FLAG_NAME]==1)&(students_df[CLASS_FLAG_NAME].isin([2, 3]))]
for target in CAT_EXP_COLUMNS_LIST:
# 小規模クラス群と通常クラス群間について、各カテゴリ変数ごとの実測度数を算出する
tmp_target_small_array = []
tmp_target_normal_array = []
for category_value in np.sort(students_df[~students_df[target].isnull()][target].unique()):
if ((small_df[target]==category_value).sum()==0)&((normal_df[target]==category_value).sum()==0):
pass
else:
tmp_target_small_array.append((small_df[target]==category_value).sum())
tmp_target_normal_array.append((normal_df[target]==category_value).sum())
# 上記で得られた度数を元に、カイ二乗検定によるp値とクラメールの連関係数を算出する
x2, p_value, _, _ = scipy.stats.chi2_contingency(np.array([tmp_target_small_array,
tmp_target_normal_array]),
correction=True)
total_num = sum(tmp_target_small_array) + sum(tmp_target_normal_array)
cramer_v = np.sqrt(x2 / (total_num * (np.min([len(tmp_target_small_array), len(tmp_target_normal_array)]) - 1)))
listwise_side_exp_cat.append([key + '_' + target, p_value, cramer_v])
listwise_side_exp_cat_df = pd.DataFrame(listwise_side_exp_cat, columns=['説明変数', 'p値', 'クラメールの連関係数'])
C:\Users\koji.nishimura\AppData\Local\Temp\ipykernel_21308\4255097293.py:30: RuntimeWarning: invalid value encountered in divide cramer_v = np.sqrt(x2 / (total_num * (np.min([len(tmp_target_small_array), len(tmp_target_normal_array)]) - 1)))
listwise_side_exp_cat_df.head()
| 説明変数 | p値 | クラメールの連関係数 | |
|---|---|---|---|
| 0 | GK_GENDER | 0.990568 | 0.000149 |
| 1 | GK_RACE | 0.083551 | 0.017536 |
| 2 | GK_ENTRANCE_GRADE | 0.472784 | 0.010892 |
| 3 | GK_GKSURBAN | 0.054367 | 0.020050 |
| 4 | GK_GKFREELU | 0.167993 | 0.017370 |
print('小規模クラスと大規模クラスのデモグラフィック変数(カテゴリカル)比較(欠損値はリストワイズ法で処理)')
listwise_side_exp_cat_df[(listwise_side_exp_cat_df.p値<0.05)&(listwise_side_exp_cat_df.クラメールの連関係数>0.1)]
小規模クラスと大規模クラスのデモグラフィック変数(カテゴリカル)比較(欠損値はリストワイズ法で処理)
| 説明変数 | p値 | クラメールの連関係数 | |
|---|---|---|---|
| 14 | G1_FLAGGK | 4.582738e-42 | 0.164454 |
| 17 | G1_G1TGEN | 2.933399e-17 | 0.102386 |
| 26 | G2_FLAGGK | 6.686870e-39 | 0.157744 |
| 38 | G3_FLAGGK | 2.118710e-27 | 0.131489 |
多重代入法の推定器はベイジアンリッジ回帰
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_iterative_imputer # noqa
from sklearn.impute import IterativeImputer
import itertools
from tqdm.notebook import tqdm
from sklearn.preprocessing import OneHotEncoder
multiple_impute_side_exp_cat = []
for key in tqdm(RCT_COMPARISON_EACH_YEAR.keys()):
CAT_EXP_COLUMNS_LIST = RCT_COMPARISON_EACH_YEAR[key]['CATEGORY_EXP_VARIABLES']
STAR_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['STAR_FLAG']
CLASS_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['CLASS_FLAG']
tmp_df = students_df[(students_df[STAR_FLAG_NAME]==1)]
tmp_df = tmp_df.astype({'FLAGGK': 'float64'})
for target in tqdm(CAT_EXP_COLUMNS_LIST):
# 多重代入法による処理を実装する(ランダムフォレスト回帰)
target_array = np.array(tmp_df[target])
if len(tmp_df[target].unique())==1:
pass
else:
tmp_exp_list = CAT_EXP_COLUMNS_LIST
tmp_exp_list.remove(target)
# TODO: One-Hot-Encoderの実装変更
tmp_df_dummies = pd.get_dummies(tmp_df, columns=tmp_exp_list,
drop_first=True)
tmp_df_dummies[target] = target_array
EXP_VARS = list(itertools.chain.from_iterable([[target], RCT_COMPARISON_ALL_YEAR['NUMERIC_EXP_VARIABLES'],
tmp_df_dummies[[s for s in tmp_df_dummies.columns if '.0' in s]]]))
# TODO: IterativeImputer周りの実装変更。収束しなかった、エラーが出た場合は、現在は一旦全部無視している
try:
mi_list = IterativeImputer(max_iter=100).fit_transform(tmp_df_dummies[EXP_VARS])
mi_list = [x[0] for x in mi_list]
tmp_df[target] = mi_list
mi_df = tmp_df.copy()
mi_df[target] = mi_list
small_df = mi_df[(mi_df[STAR_FLAG_NAME]==1)&(mi_df[CLASS_FLAG_NAME]==1)]
normal_df = mi_df[(mi_df[STAR_FLAG_NAME]==1)&(mi_df[CLASS_FLAG_NAME].isin([2, 3]))]
# 小規模クラス群と通常クラス群間について、各カテゴリ変数ごとの実測度数を算出する
tmp_target_small_array = []
tmp_target_normal_array = []
for category_value in np.sort(students_df[~students_df[target].isnull()][target].unique()):
if ((small_df[target]==category_value).sum()==0)&((normal_df[target]==category_value).sum()==0):
pass
else:
tmp_target_small_array.append((small_df[target]==category_value).sum())
tmp_target_normal_array.append((normal_df[target]==category_value).sum())
# 上記で得られた度数を元に、カイ二乗検定によるp値とクラメールの連関係数を算出する
x2, p_value, _, _ = scipy.stats.chi2_contingency(np.array([tmp_target_small_array,
tmp_target_normal_array]),
correction=True)
total_num = sum(tmp_target_small_array) + sum(tmp_target_normal_array)
cramer_v = np.sqrt(x2 / (total_num * (np.min([len(tmp_target_small_array), len(tmp_target_normal_array)]) - 1)))
mannwhitneyu_test = scipy.stats.mannwhitneyu(small_df[~(small_df[target].isnull())][target],
normal_df[~(normal_df[target].isnull())][target],
alternative='greater')
multiple_impute_side_exp_cat.append([key + '_' + target, mannwhitneyu_test.pvalue, cramer_v])
except:
multiple_impute_side_exp_cat.append([key + '_' + target, np.nan, np.nan])
multiple_side_exp_cat_df = pd.DataFrame(multiple_impute_side_exp_cat, columns=['説明変数', 'p値', 'クラメールの連関係数'])
0%| | 0/4 [00:00<?, ?it/s]
0%| | 0/3 [00:00<?, ?it/s]
0%| | 0/3 [00:00<?, ?it/s]
0%| | 0/3 [00:00<?, ?it/s]
0%| | 0/3 [00:00<?, ?it/s]
print('小規模クラスと大規模クラスのデモグラフィック変数(カテゴリカル)比較(欠損値は多重代入法で処理)')
multiple_side_exp_cat_df[(multiple_side_exp_cat_df.p値<0.05)&(multiple_side_exp_cat_df.クラメールの連関係数>0.1)]
小規模クラスと大規模クラスのデモグラフィック変数(カテゴリカル)比較(欠損値は多重代入法で処理)
| 説明変数 | p値 | クラメールの連関係数 | |
|---|---|---|---|
| 2 | G1_FLAGGK | 1.588693e-42 | 0.164454 |
| 4 | G2_FLAGGK | 2.389984e-39 | 0.157744 |
| 6 | G3_FLAGGK | 8.028382e-28 | 0.131489 |
print('小規模クラスと大規模クラスのデモグラフィック変数(カテゴリカル)比較(欠損値は多重代入法で処理)')
multiple_side_exp_cat_df[(multiple_side_exp_cat_df.クラメールの連関係数.isnull())]
小規模クラスと大規模クラスのデモグラフィック変数(カテゴリカル)比較(欠損値は多重代入法で処理)
| 説明変数 | p値 | クラメールの連関係数 |
|---|
from sklearn.experimental import enable_iterative_imputer # noqa
from sklearn.impute import IterativeImputer
import itertools
from tqdm.notebook import tqdm
propensity_score = {}
for key in tqdm(RCT_COMPARISON_EACH_YEAR.keys()):
TARGET_COLUMNS_LIST = RCT_COMPARISON_EACH_YEAR[key]['TARGET_COLUMNS']
STAR_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['STAR_FLAG']
CLASS_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['CLASS_FLAG']
EXP_VARS = list(itertools.chain.from_iterable([RCT_COMPARISON_EACH_YEAR[key]['NUMERIC_EXP_VARIABLES'],
RCT_COMPARISON_EACH_YEAR[key]['CATEGORY_EXP_VARIABLES']]))
# 多重代入法による処理を実装する
tmp_df = students_df[(students_df[STAR_FLAG_NAME]==1)].copy()
tmp_list = list(itertools.chain.from_iterable([[target], EXP_VARS]))
#mi_list = IterativeImputer(RandomForestRegressor(random_state=0), max_iter=3).fit_transform(tmp_df[tmp_list])
mi_list = IterativeImputer(max_iter=100).fit_transform(tmp_df[tmp_list])
#mi_list = [x[0] for x in mi_list]
tmp_df[tmp_list] = mi_list
tmp_df['class_flag_binary'] = tmp_df[CLASS_FLAG_NAME].apply(lambda x : 1 if x == 1 else 2)
# プロペンシティスコアを算出
clf = LogisticRegression(max_iter=1000, penalty='l2')
result = clf.fit(tmp_df[EXP_VARS], tmp_df['class_flag_binary'])
ps = result.predict_proba(tmp_df[EXP_VARS])
propensity_score[key] = ps
0%| | 0/4 [00:00<?, ?it/s]
ipw = []
for key in tqdm(RCT_COMPARISON_EACH_YEAR.keys()):
TARGET_COLUMNS_LIST = RCT_COMPARISON_EACH_YEAR[key]['TARGET_COLUMNS']
STAR_FLAG_NAME = RCT_COMPARISON_EACH_YEAR[key]['STAR_FLAG']
tmp_df = students_df[(students_df[STAR_FLAG_NAME]==1)].copy()
tmp_df['propensity_score'] = [x[0] for x in propensity_score[key]]
for target in tqdm(TARGET_COLUMNS_LIST):
small_df = tmp_df[(tmp_df[CLASS_FLAG_NAME]==1)].copy()
normal_df = tmp_df[(tmp_df[CLASS_FLAG_NAME].isin([2, 3]))].copy()
e_1 = np.sum((1 /small_df['propensity_score']) * small_df[target]) / len(students_df)
e_0 = np.sum((1 /(1-normal_df['propensity_score'])) * normal_df[target]) / len(students_df)
ipw.append([key, target, e_1-e_0])
0%| | 0/4 [00:00<?, ?it/s]
0%| | 0/5 [00:00<?, ?it/s]
0%| | 0/5 [00:00<?, ?it/s]
0%| | 0/5 [00:00<?, ?it/s]
0%| | 0/5 [00:00<?, ?it/s]
ipw_df = pd.DataFrame(ipw, columns=['STAR学年', '変数', '平均の差'])
tmp_df = each_iterative_imputed_target_df.merge(ipw_df, left_on='目的変数', right_on='変数')
tmp_df = tmp_df[['目的変数', '平均の差_x', '平均の差_y']]
tmp_df.columns = ['変数名', 'ATE', 'ATE(傾向スコア補正済)']
tmp_df
| 変数名 | ATE | ATE(傾向スコア補正済) | |
|---|---|---|---|
| 0 | GKSELFCO | 0.467623 | 6.099352 |
| 1 | GKMOTIVR | 0.060130 | 2.749619 |
| 2 | GKTREADS | 5.021838 | 50.842831 |
| 3 | GKTMATHS | 7.457092 | 56.373371 |
| 4 | GKTLISTS | 3.282221 | 59.573082 |
| 5 | G1SELFCO | 0.347876 | 7.194383 |
| 6 | G1MOTIVR | 0.051185 | 7.844183 |
| 7 | G1TREADS | 12.564653 | 87.749201 |
| 8 | G1TMATHS | 11.146923 | 93.131033 |
| 9 | G1TLISTS | 7.181223 | 97.499399 |
| 10 | G2SELFCO | 0.007160 | -7.713263 |
| 11 | G2MOTIVR | -0.066738 | -8.164959 |
| 12 | G2TREADS | 8.702871 | -101.393200 |
| 13 | G2TMATHS | 8.020971 | -96.743975 |
| 14 | G2TLISTS | 5.824754 | -97.541888 |
| 15 | G3SELFCO | 0.091119 | -9.872423 |
| 16 | G3MOTIVR | 0.014178 | -10.805804 |
| 17 | G3TREADS | 7.747531 | -140.404814 |
| 18 | G3TMATHS | 6.869544 | -142.543356 |
| 19 | G3TLISTS | 3.590759 | -141.992347 |
fig, axes = plt.subplots(facecolor="white", nrows=2, ncols=2, figsize=(10, 8))
data = [x[0] for x in propensity_score['GK']]
axes[0, 0].hist(data, bins=30)
axes[0, 0].set_title('GK')
data = [x[0] for x in propensity_score['G1']]
axes[0, 1].hist(data, bins=30)
axes[0, 1].set_title('G1')
data = [x[0] for x in propensity_score['G2']]
axes[1, 0].hist(data, bins=30)
axes[1, 0].set_title('G2')
data = [x[0] for x in propensity_score['G3']]
axes[1, 1].hist(data, bins=30)
axes[1, 1].set_title('G3')
plt.show()